diff --git a/util/pypi/go.mod b/util/pypi/go.mod new file mode 100644 index 00000000..9c7791c9 --- /dev/null +++ b/util/pypi/go.mod @@ -0,0 +1,7 @@ +module deps.dev/util/pypi + +go 1.23.4 + +replace deps.dev/util/semver => ../semver + +require deps.dev/util/semver v0.0.0-20241230231135-52b7655a522f diff --git a/util/pypi/metadata.go b/util/pypi/metadata.go new file mode 100644 index 00000000..33985e64 --- /dev/null +++ b/util/pypi/metadata.go @@ -0,0 +1,259 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pypi + +import ( + "bytes" + "context" + "fmt" + "io" + "log" + "net/mail" + "strings" + "unicode/utf8" + + "deps.dev/util/semver" +) + +// Metadata holds metadata for a distribution as defined in +// https://packaging.python.org/specifications/core-metadata/. +type Metadata struct { + // Name and Version are the only fields required by the spec. + // Taken directly from Metadata and not canonicalized. + Name, Version string + + // Optional metadata as defined by the spec. + Summary string + Description string + Homepage string + Author string + AuthorEmail string + Maintainer string + MaintainerEmail string + License string + Classifiers []string + ProjectURLs []string + + Dependencies []Dependency +} + +// ParseMetadata reads a METADATA or PKG-INFO file and collects as much +// information as possible. 
The earliest version of this format was a set of RFC +// 822 headers (see https://www.python.org/dev/peps/pep-0241/) with later +// versions (https://www.python.org/dev/peps/pep-0566/) adding the ability to +// include a message body rendering the format essentially the same as an email. +// The latest specification is here: +// https://packaging.python.org/en/latest/specifications/core-metadata/. For +// reference distlib, the library used by pip for this job, uses python's +// standard library email reader to read these files (see +// https://bitbucket.org/pypa/distlib/src/default/distlib/metadata.py). The +// current version of the specification requires metadata to be encoded as +// UTF-8, so an error will be returned if any invalid UTF-8 is discovered. +func ParseMetadata(ctx context.Context, data string) (Metadata, error) { + if !utf8.ValidString(data) { + // TODO: maybe we could be a bit more lenient to support + // older packages. + return Metadata{}, parseErrorf("invalid UTF-8") + } + // Add a newline to the end; some files have no body which is an error to + // net/mail. Adding a newline ensures it will parse an empty body. + buf := bytes.NewBufferString(data) + buf.WriteByte('\n') + msg, err := mail.ReadMessage(buf) + if err != nil { + return Metadata{}, parseErrorf("parsing python metadata: %v", err) + } + md := Metadata{} + + header := func(name string) (value string) { + vs := msg.Header[name] + if len(vs) > 1 { + log.Printf("Header set multiple times: %q: %q", name, vs) + } + if len(vs) == 1 && vs[0] != "UNKNOWN" { + value = vs[0] + } + return + } + multiHeader := func(name string) (values []string) { + for _, v := range msg.Header[name] { + if v != "UNKNOWN" { + values = append(values, v) + } + } + return + } + + // Dependencies need some parsing and will always be needed. 
+ for _, d := range msg.Header["Requires-Dist"] { + dep, err := ParseDependency(d) + if err != nil { + return Metadata{}, err + } + md.Dependencies = append(md.Dependencies, dep) + } + + md.Name = header("Name") + md.Version = header("Version") + md.Summary = header("Summary") + md.Description = header("Description") + md.Homepage = header("Home-Page") + md.Author = header("Author") + md.AuthorEmail = header("Author-Email") + md.Maintainer = header("Maintainer") + md.MaintainerEmail = header("Maintainer-Email") + md.License = header("License") + md.ProjectURLs = multiHeader("Project-Url") + md.Classifiers = multiHeader("Classifier") + + // The description may be in the message body. + body, err := io.ReadAll(msg.Body) + if err != nil { + return Metadata{}, parseErrorf("reading metadata description: %v", err) + } + if len(body) > 0 { + // Remove the extra line we added earlier to ensure a valid message. + body = body[:len(body)-1] + md.Description = string(body) + } + return md, nil +} + +// Dependency is a dependency on a package. +type Dependency struct { + Name string + Extras string + Constraint string + Environment string +} + +// ParseDependency parses a python requirement statement according to PEP 508 +// (https://www.python.org/dev/peps/pep-0508/), apart from URL requirements. +func ParseDependency(v string) (Dependency, error) { + var d Dependency + if v == "" { + return d, parseErrorf("invalid python requirement: empty string") + } + const whitespace = " \t" // according to the PEP this is the only allowed whitespace + s := strings.Trim(v, whitespace) + // For our purposes, the name is some characters ending with space or the + // start of something else. 
+ nameEnd := strings.IndexAny(s, whitespace+"[(;<=!~>") + if nameEnd == 0 { + return d, parseErrorf("invalid python requirement: empty name") + } + if nameEnd < 0 { + d.Name = CanonPackageName(s) + return d, nil + } + d.Name = CanonPackageName(s[:nameEnd]) + s = strings.TrimLeft(s[nameEnd:], whitespace) + // Does it have extras? + if s[0] == '[' { + end := strings.IndexByte(s, ']') + if end < 0 { + return d, parseErrorf("invalid python requirement: %q has unterminated extras section", v) + } + // Extract whatever is inside the [] + d.Extras = strings.Trim(s[1:end], whitespace) + s = s[end+1:] + } + // Does it have a constraint? + if len(s) > 0 && s[0] != ';' { + end := strings.IndexByte(s, ';') + if end < 0 { + end = len(s) // all of the remainder is the constraint + } + d.Constraint = strings.Trim(s[:end], whitespace) + // May be parenthesized, we can remove those. + if strings.HasPrefix(d.Constraint, "(") && strings.HasSuffix(d.Constraint, ")") { + d.Constraint = d.Constraint[1 : len(d.Constraint)-1] + } + s = s[end:] + } + // Anything left must be a condition starting with ';'. Otherwise there should + // be no way for s to be non-empty. If it is something's wrong, that's an + // error. + if len(s) > 0 && s[0] != ';' { + return d, parseErrorf("invalid python requirement: internal parse error on %q", v) + } + if s != "" { + d.Environment = strings.Trim(s[1:], whitespace) // s[1] == ';' + } + return d, nil +} + +// CanonVersion canonicalizes a version string. If the version does not parse +// according to PEP 440 it is returned as-is. +func CanonVersion(ver string) string { + v, err := semver.PyPI.Parse(ver) + if err != nil { + return ver + } + return v.Canon(true) +} + +// CanonPackageName returns the canonical form of the given PyPI package name. +func CanonPackageName(name string) string { + // https://github.com/pypa/pip/blob/20.0.2/src/pip/_vendor/packaging/utils.py + // https://www.python.org/dev/peps/pep-0503/ + // Names may only be [-_.A-Za-z0-9]. 
+ // Replace runs of [-_.] with a single "-", then lowercase everything. + var out bytes.Buffer + run := false // whether a run of [-_.] has started. + for i := 0; i < len(name); i++ { + switch c := name[i]; { + case 'a' <= c && c <= 'z', '0' <= c && c <= '9': + out.WriteByte(c) + run = false + case 'A' <= c && c <= 'Z': + out.WriteByte(c + ('a' - 'A')) + run = false + case c == '-' || c == '_' || c == '.': + if !run { + out.WriteByte('-') + } + run = true + default: + run = false + } + } + return out.String() +} + +// ParseError is returned when we encounter data that fails to parse. +type ParseError struct { + msg string +} + +func (p ParseError) Error() string { + return p.msg +} + +// parseErrorf constructs a pypiParseError with a formatted message. +func parseErrorf(format string, args ...any) ParseError { + return ParseError{msg: fmt.Sprintf(format, args...)} +} + +// UnsupportedError is an error used to indicate when we encounter types of +// packaging that we can not yet handle. +type UnsupportedError struct { + msg string + packageType string +} + +func (p UnsupportedError) Error() string { + return fmt.Sprintf("%s: %s", p.packageType, p.msg) +} diff --git a/util/pypi/metadata_test.go b/util/pypi/metadata_test.go new file mode 100644 index 00000000..9f3c8522 --- /dev/null +++ b/util/pypi/metadata_test.go @@ -0,0 +1,266 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package pypi + +import ( + "context" + "errors" + "reflect" + "testing" +) + +var numpyPkgInfoRaw = `Metadata-Version: 1.2 +Name: numPy +Version: 1.16.4 +Summary: NumPy is the fundamental package for array computing with Python. +Home-page: https://www.numpy.org +Author: NumPy Developers +Author-email: numpy-discussion@python.org +License: BSD +Download-URL: https://pypi.python.org/pypi/numpy +Description-Content-Type: UNKNOWN +Description: It provides: + + - a powerful N-dimensional array object + - sophisticated... + +Platform: Windows +Platform: Linux +Platform: Solaris +Platform: Mac OS-X +Platform: Unix +Classifier: Development Status :: 5 - Production/Stable +Classifier: License :: OSI Approved +Classifier: Programming Language :: C +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: Implementation :: CPython +Classifier: Topic :: Software Development +Classifier: Topic :: Scientific/Engineering +Classifier: Operating System :: Microsoft :: Windows +Classifier: Operating System :: POSIX +Classifier: Operating System :: Unix +Classifier: Operating System :: MacOS +Requires-Python: >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.* +Project-URL: Homepage, https://www.numpy.org +` + +var numpyPkgInfo = Metadata{ + Name: "numPy", + Version: "1.16.4", + Summary: "NumPy is the fundamental package for array computing with Python.", + Description: "It provides: - a powerful N-dimensional array object - sophisticated... 
", + Homepage: "https://www.numpy.org", + Author: "NumPy Developers", + AuthorEmail: "numpy-discussion@python.org", + License: "BSD", + Classifiers: []string{ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved", + "Programming Language :: C", + "Programming Language :: Python", + "Programming Language :: Python :: Implementation :: CPython", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + }, + ProjectURLs: []string{"Homepage, https://www.numpy.org"}, +} + +// A real life METADATA file from a wheel, with the description in the body. +var numbaMetadataRaw = `Metadata-Version: 2.1 +Name: Numba +Version: 0.44.0 +Summary: compiling Python code using LLVM +Home-page: https://github.com/numba/numba +Author: Anaconda, Inc. +Author-email: numba-users@continuum.io +License: BSD +Platform: UNKNOWN +Requires-Dist: llvmlite (>=0.29.0) +Requires-Dist: numpy +Requires-Dist: funcsigs; python_version < "3.3" +Requires-Dist: enum34; python_version < "3.4" +Requires-Dist: singledispatch; python_version < "3.4" + +***** +Numba +***** + +.. image:: https://badges.gitter.im/numba/numba.svg + :target: https://gitter.im/numba/numba?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge + :alt: Gitter + +A Just-In-Time Compiler for Numerical Functions in Python +######################################################### + +Numba is an open source, +` + +var numbaMetadataParsed = Metadata{ + Name: "Numba", + Version: "0.44.0", + Summary: "compiling Python code using LLVM", + Description: "*****\nNumba\n*****\n\n.. 
image:: https://badges.gitter.im/numba/numba.svg\n :target: https://gitter.im/numba/numba?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge\n :alt: Gitter\n\nA Just-In-Time Compiler for Numerical Functions in Python\n#########################################################\n\nNumba is an open source,\n", + Homepage: "https://github.com/numba/numba", + Author: "Anaconda, Inc.", + AuthorEmail: "numba-users@continuum.io", + License: "BSD", + Dependencies: []Dependency{ + {"llvmlite", "", ">=0.29.0", ""}, + {"numpy", "", "", ""}, + {"funcsigs", "", "", "python_version < \"3.3\""}, + {"enum34", "", "", "python_version < \"3.4\""}, + {"singledispatch", "", "", "python_version < \"3.4\""}, + }, +} + +// badPyPIMetadata contains some invalid metadata that should trigger a parse +// error. +var badPyPIMetadata = []string{ + // Missing bracket in the requirement. + `Metadata-Version: 2.1 +Name: numba +Version: 0.44.0 +Summary: compiling Python code using LLVM +Requires-Dist: llvmlite[banana (>=0.29.0) + +***** +Numba +`, + // Incorrect line folding. + `Metadata-Version: 2.1 +Name: numba +Version: 0.44.0 +Summary: compiling Python code using LLVM +License: A long license that require +many lines to express. +Yes. +Requires-Dist: llvmlite (>=0.29.0) +`, + // Invalid UTF-8, uses an ISO-8859 non-breaking space. 
+ `Metadata-Version: 2.1 +Name: numba +Version: 0.44.0 +Summary: compiling Python` + string([]byte{0xA0}) + ` code using LLVM +`, +} + +func TestParseMetadata(t *testing.T) { + ctx := context.Background() + + // real examples we want to be able to parse + got, err := ParseMetadata(ctx, numpyPkgInfoRaw) + if err != nil { + t.Errorf("Parsing numpy metadata: %v", err) + } + if !reflect.DeepEqual(got, numpyPkgInfo) { + t.Errorf("numpy metadata:\n got: %#v\nwant: %#v", got, numpyPkgInfo) + } + got, err = ParseMetadata(ctx, numbaMetadataRaw) + if err != nil { + t.Errorf("Parsing numba metadata: %v", err) + } + if !reflect.DeepEqual(got, numbaMetadataParsed) { + t.Errorf("numba metadata:\n got: %#v\nwant: %#v", got, numbaMetadataParsed) + } + for i, md := range badPyPIMetadata { + got, err := ParseMetadata(ctx, md) + var pErr ParseError + if ok := errors.As(err, &pErr); !ok { + t.Errorf("Parsing bad metadata %d: got: (%v, %#v), want ParseError", i, got, err) + } + } +} + +func TestParseDependency(t *testing.T) { + for _, c := range []struct { + r string + w *Dependency + }{ + // Cases we do handle. 
+ // plain names: + {"plain", &Dependency{"plain", "", "", ""}}, + {"colon;", &Dependency{"colon", "", "", ""}}, + {" leading-space", &Dependency{"leading-space", "", "", ""}}, + {"trailing-space\t", &Dependency{"trailing-space", "", "", ""}}, + // extras: + {"empty-extra[]", &Dependency{"empty-extra", "", "", ""}}, + {"spaced\t[hello ] ", &Dependency{"spaced", "hello", "", ""}}, + {"extra[more]", &Dependency{"extra", "more", "", ""}}, + {"extras[even, more]", &Dependency{"extras", "even, more", "", ""}}, + // bare constraints, including with non-canonical names: + {"constraint >=2.1.2", &Dependency{"constraint", "", ">=2.1.2", ""}}, + {"Multi ~=3.6, !=3.8.1", &Dependency{"multi", "", "~=3.6, !=3.8.1", ""}}, + {"no_space>=1,!=3.4", &Dependency{"no-space", "", ">=1,!=3.4", ""}}, + // conditions: + {"condition;python_version < \"3.6\"", &Dependency{"condition", "", "", "python_version < \"3.6\""}}, + {"space_condition ; platform_machine == x86_64", &Dependency{"space-condition", "", "", "platform_machine == x86_64"}}, + // combinations: + {"extra-constraint[more] ==2.0", &Dependency{"extra-constraint", "more", "==2.0", ""}}, + {"extra-condition[stuff]; implementation_name == cpython", &Dependency{"extra-condition", "stuff", "", "implementation_name == cpython"}}, + {"constraint-condition <1.0.0-alpha; extra == \"stuff\"", &Dependency{"constraint-condition", "", "<1.0.0-alpha", "extra == \"stuff\""}}, + {"alltheabove[all,the,things] >=0.0; python_version >= 2.0", &Dependency{"alltheabove", "all,the,things", ">=0.0", "python_version >= 2.0"}}, + {"parens (!=2.0)", &Dependency{"parens", "", "!=2.0", ""}}, + + // unsalvageable errors: + {"", nil}, + {";", nil}, + {"unterminated[something >2.1", nil}, + } { + t.Run(c.r, func(t *testing.T) { + r, err := ParseDependency(c.r) + if err != nil { + if c.w != nil { + t.Errorf("want %q to parse: got %#v", c.r, err) + } + return + } + if c.w == nil { + t.Errorf("want %q to fail: got %#v", c.r, r) + return + } + if 
!reflect.DeepEqual(c.w, &r) { + t.Errorf("parse %q: want: %#v, got: %#v", c.r, c.w, r) + } + }) + } +} + +func TestCanonPackageName(t *testing.T) { + tests := []struct { + in, out string + }{ + // Test cases from https://github.com/pypa/packaging/blob/20.0/tests/test_utils.py. + {"foo", "foo"}, + {"Foo", "foo"}, + {"fOo", "foo"}, + {"foo.bar", "foo-bar"}, + {"Foo.Bar", "foo-bar"}, + {"Foo.....Bar", "foo-bar"}, + {"foo_bar", "foo-bar"}, + {"foo___bar", "foo-bar"}, + {"foo-bar", "foo-bar"}, + {"foo----bar", "foo-bar"}, + {"foo-Տ", "foo-"}, // Strip out non-ASCII + } + for _, test := range tests { + if got := CanonPackageName(test.in); got != test.out { + t.Errorf("CanonPackageName(%s): got %s, want %s", test.in, got, test.out) + } + } +} diff --git a/util/pypi/sdist.go b/util/pypi/sdist.go new file mode 100644 index 00000000..2e2d25dd --- /dev/null +++ b/util/pypi/sdist.go @@ -0,0 +1,195 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pypi + +import ( + "archive/tar" + "bufio" + "bytes" + "compress/gzip" + "context" + "fmt" + "io" + "path/filepath" + "regexp" + "strings" +) + +// SdistVersion attempts to extract the version from the name of an sdist file. +// The format of the names is not standardized, but it is a strong enough +// convention that pip relies on it (see +// https://github.com/pypa/pip/blob/0442875a68f19b0118b0b88c747bdaf6b24853ba/src/pip/_internal/index/package_finder.py#L978). 
+// The filenames are formatted <name>-<version>, where the name is not +// necessarily canonicalized. The returned version will be canonicalized if +// possible. +func SdistVersion(canonName, filename string) (string, string, error) { + // Take every substring ending in "-" and see if it canonicalizes to the + // name we are looking for. + // Start by trimming the extension. + nameVersion := strings.TrimSuffix(filename, filepath.Ext(filename)) + // .tar.gz sdists have two extensions, make sure to trim .tar. + nameVersion = strings.TrimSuffix(nameVersion, ".tar") + for i, r := range nameVersion { + if r != '-' { + continue + } + name := CanonPackageName(nameVersion[:i]) + if name == canonName { + return nameVersion[:i], nameVersion[i+1:], nil + } + } + return "", "", fmt.Errorf("invalid filename for package %q: %q", canonName, filename) +} + +// Regular expression indicating a setup.py or setup.cfg specifies dependencies. +// There may be some false positives: a line could be commented out or not in +// the right place. There will be no false negatives; to specify dependencies +// there must be at least one match for this pattern. +var installRequiresPattern = regexp.MustCompile(`install_requires[ \t]*=`) + +// SdistMetadata attempts to read metadata out of the supplied reader assuming +// it contains an sdist. The reader should be either a tar or a zip file, +// the extension of the supplied filename will be used to distinguish. +func SdistMetadata(ctx context.Context, fileName string, r io.Reader) (*Metadata, error) { + // setupPy and setupCFG indicate whether we have found dependency information + // in a setup.py or setup.cfg. 
+ setupPy, setupCFG := false, false + var meta Metadata + + walkFn := func(name string, r io.Reader) error { + _, name, ok := strings.Cut(name, "/") + if !ok { + return nil + } + if name == "setup.py" && !setupPy { + setupPy = installRequiresPattern.MatchReader(bufio.NewReader(r)) + return nil + } + if name == "setup.cfg" && !setupCFG { + setupCFG = installRequiresPattern.MatchReader(bufio.NewReader(r)) + return nil + } + if name != "PKG-INFO" { + return nil + } + if meta.Name != "" { + // Multiple top level PKG-INFO is only possible if the contains multiple + // packages. This is invalid and therefore unsupported. + return UnsupportedError{ + msg: "multiple top level PKG-INFO", + packageType: "sdist", + } + } + contents, err := io.ReadAll(r) + if err != nil { + return err + } + md, err := ParseMetadata(ctx, string(contents)) + if err != nil { + return err + } + meta.Name = md.Name + meta.Version = md.Version + meta.Summary = md.Summary + meta.Description = md.Description + meta.Homepage = md.Homepage + meta.Author = md.Author + meta.AuthorEmail = md.AuthorEmail + meta.Maintainer = md.Maintainer + meta.MaintainerEmail = md.MaintainerEmail + meta.License = md.License + meta.Classifiers = md.Classifiers + meta.ProjectURLs = md.ProjectURLs + if len(meta.Dependencies) == 0 { + meta.Dependencies = md.Dependencies + } + return nil + } + switch { + case strings.HasSuffix(fileName, ".tar.gz"), + strings.HasSuffix(fileName, ".tgz"): + tgz, err := gzip.NewReader(r) + if err != nil { + return nil, err + } + defer tgz.Close() + if err := walkTarFiles(tgz, walkFn); err != nil { + return nil, err + } + case strings.HasSuffix(fileName, ".zip"): + // TODO: try and avoid this. 
+ contents, err := io.ReadAll(r) + if err != nil { + return nil, err + } + if err := walkZipFiles(bytes.NewReader(contents), int64(len(contents)), walkFn); err != nil { + return nil, err + } + default: + return nil, UnsupportedError{ + msg: fmt.Sprintf("unsupported sdist format: %s", fileName), + packageType: "sdist", + } + } + if meta.Name == "" { + return nil, UnsupportedError{ + msg: "no PKG-INFO", + packageType: "sdist", + } + } + if len(meta.Dependencies) == 0 { + switch { + // If we found no dependencies in PKG-INFO but saw an + // install_requires line in a setup.py or setup.cfg file then + // report and error; we can't handle those dependencies yet. + case setupCFG: + return nil, UnsupportedError{ + msg: "dependencies in setup.cfg, not in PKG-INFO", + packageType: "sdist", + } + case setupPy: + return nil, UnsupportedError{ + msg: "dependencies in setup.py, not in PKG-INFO", + packageType: "sdist", + } + default: + // It genuinely has no dependencies. + } + } + return &meta, nil +} + +// walkTarFiles walks through the files in a tar archive, applying the given +// function one at a time to the name of the file and a reader containing its +// contents until all files have been visited or the first error. +func walkTarFiles(r io.Reader, f func(string, io.Reader) error) error { + tfr := tar.NewReader(r) + for { + h, err := tfr.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + if h.Typeflag != tar.TypeReg { + continue + } + if err := f(h.Name, tfr); err != nil { + return err + } + } + return nil +} diff --git a/util/pypi/sdist_test.go b/util/pypi/sdist_test.go new file mode 100644 index 00000000..1f361a68 --- /dev/null +++ b/util/pypi/sdist_test.go @@ -0,0 +1,210 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pypi + +import ( + "archive/tar" + "archive/zip" + "bytes" + "compress/gzip" + "context" + "errors" + "io" + "reflect" + "sort" + "testing" + "time" +) + +func tarfile(t *testing.T, files map[string]string) []byte { + var buf bytes.Buffer + tfw := tar.NewWriter(&buf) + for name, contents := range files { + byteContents := []byte(contents) + hdr := &tar.Header{ + Name: name, + Size: int64(len(byteContents)), + ModTime: time.Now(), + } + if err := tfw.WriteHeader(hdr); err != nil { + t.Fatal(err) + } + if _, err := tfw.Write(byteContents); err != nil { + t.Fatal(err) + } + } + if err := tfw.Close(); err != nil { + t.Fatal(err) + } + return buf.Bytes() +} + +func targzfile(t *testing.T, files map[string]string) []byte { + tf := tarfile(t, files) + var buf bytes.Buffer + gzw := gzip.NewWriter(&buf) + if _, err := gzw.Write(tf); err != nil { + t.Fatal(err) + } + if err := gzw.Close(); err != nil { + t.Fatal(err) + } + return buf.Bytes() +} + +func zipfile(t *testing.T, files map[string]string) []byte { + var buf bytes.Buffer + zw := zip.NewWriter(&buf) + var names []string + for n := range files { + names = append(names, n) + } + sort.Strings(names) + for _, name := range names { + w, err := zw.Create(name) + if err != nil { + t.Fatal(err) + } + if _, err := io.WriteString(w, files[name]); err != nil { + t.Fatal(err) + } + } + if err := zw.Close(); err != nil { + t.Fatal(err) + } + return buf.Bytes() +} + +func TestSdistMetadata(t *testing.T) { + ctx := context.Background() + + cases := []struct { + files map[string]string + want *Metadata + 
unsupported string + }{ + { + files: map[string]string{ + "test-1.1.1/": "", + "test-1.1.1/file-to-ignore.txt": "this is boring", + "test-1.1.1/PKG-INFO": numpyPkgInfoRaw, + "test-1.1.1/test.egg-info/PKG-INFO": numbaMetadataRaw, + }, + want: &numpyPkgInfo, + }, + { + files: map[string]string{ + "test-1.1.2/PKG-INFO": numpyPkgInfoRaw, + "test-1.1.2/test.egg-info/requires.txt": "requirement-a\nrequirement-b\n", + }, + want: &Metadata{ + Name: numpyPkgInfo.Name, + Version: numpyPkgInfo.Version, + Summary: numpyPkgInfo.Summary, + Description: numpyPkgInfo.Description, + Homepage: numpyPkgInfo.Homepage, + Author: numpyPkgInfo.Author, + AuthorEmail: numpyPkgInfo.AuthorEmail, + Maintainer: numpyPkgInfo.Maintainer, + MaintainerEmail: numpyPkgInfo.MaintainerEmail, + License: numpyPkgInfo.License, + Classifiers: numpyPkgInfo.Classifiers, + ProjectURLs: numpyPkgInfo.ProjectURLs, + // requirements only in the + // egg-info/requires.txt should be ignored. + Dependencies: nil, + }, + }, + // No PKG-INFO is an error + { + files: map[string]string{ + "test-1.1.1/METADATA": numbaMetadataRaw, + "test-1.1.1/setup.py": "print('hello, test')", + "test-1.1.1/test/__init__.py": "\n", + }, + unsupported: "no PKG-INFO", + }, + // Ensure cases that have dependencies that are not specified in a way we + // understand but are otherwise valid give an appropriate error. 
+ { + files: map[string]string{ + "test-1.1.3/PKG-INFO": numpyPkgInfoRaw, + "test-1.1.3/setup.cfg": "[options]\ninstall_requires = \n requirement-a\n requirement-b\n", + }, + unsupported: "setup.cfg", + }, + { + files: map[string]string{ + "test-1.1.4/PKG-INFO": numpyPkgInfoRaw, + "test-1.1.4/setup.py": "from setuptools import setup\n\nsetup(\n install_requires=['requirement-a', 'requirement-b']\n )\n", + }, + unsupported: "setup.py", + }, + { + files: map[string]string{ + "double-a/PKG-INFO": numpyPkgInfoRaw, + "double-b/PKG-INFO": numpyPkgInfoRaw, + }, + unsupported: "multiple PKG-INFO", + }, + } + // tar.gz files + for _, c := range cases { + tf := targzfile(t, c.files) + if c.unsupported != "" { + unsupportedSdist(ctx, t, tf, "test-1.0.tar.gz", c.unsupported) + continue + } + got, err := SdistMetadata(ctx, "test-0.0.1.tar.gz", bytes.NewBuffer(tf)) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(got, c.want) { + t.Errorf("sdist tar metadata: files:\n%+v\n got: %#v\nwant: %#v", c.files, got, c.want) + } + } + // zip files + for _, c := range cases { + tf := zipfile(t, c.files) + if c.unsupported != "" { + unsupportedSdist(ctx, t, tf, "test-1.0.zip", c.unsupported) + continue + } + got, err := SdistMetadata(ctx, "test-0.0.1.zip", bytes.NewBuffer(tf)) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(got, c.want) { + t.Errorf("sdist zip metadata: files:\n%+v\n got: %#v\nwant: %#v", c.files, got, c.want) + } + } + // Unsupported formats. 
+ unsupportedSdist(ctx, t, []byte("this is a bz2"), "test-0.0.1.tar.bz2", "bz2 archive") + unsupportedSdist(ctx, t, []byte("xz yay"), "test-0.0.1.tar.xz", "xz archive") + unsupportedSdist(ctx, t, []byte("big z"), "test-0.0.1.tar.Z", "Z archive") + // TODO: support the following, it is simpler than the tar.gz we do + // already + unsupportedSdist(ctx, t, []byte("raw tar"), "test-0.0.1.tar", "uncompressed tar") +} + +func unsupportedSdist(ctx context.Context, t *testing.T, data []byte, name, msg string) { + t.Helper() + var uerr UnsupportedError + if got, err := SdistMetadata(ctx, name, bytes.NewBuffer(data)); err == nil { + t.Errorf("%s: want error from unsupported sdist format, got:\nmetadata:\n%+v", msg, got) + } else if ok := errors.As(err, &uerr); !ok { + t.Errorf("%s: want: pypiUnsupportedError, got: %T", msg, err) + } +} diff --git a/util/pypi/wheel.go b/util/pypi/wheel.go new file mode 100644 index 00000000..937a56ec --- /dev/null +++ b/util/pypi/wheel.go @@ -0,0 +1,187 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pypi + +import ( + "archive/zip" + "context" + "fmt" + "io" + "strconv" + "strings" + "unicode" +) + +// WheelInfo holds all of the information kept in the name of a wheel file. +type WheelInfo struct { + Name string + Version string + BuildTag WheelBuildTag + Platforms []PEP425Tag +} + +// WheelBuildTag holds the components of a wheel's build tag. 
+type WheelBuildTag struct { + Num int + Tag string +} + +// PEP425Tag holds a compatibility tag defined in +// https://www.python.org/dev/peps/pep-0425/ +type PEP425Tag struct { + Python string + ABI string + Platform string +} + +// ParseWheelName extracts all of the information in the name of a wheel. The +// wheel naming format is described in PEP 427 +// (https://www.python.org/dev/peps/pep-0427/#file-name-convention). The name +// and version will always be canonicalized if possible. +func ParseWheelName(name string) (*WheelInfo, error) { + if !strings.HasSuffix(name, ".whl") { + return nil, fmt.Errorf("not a wheel filename: %q", name) + } + // Strip the suffix + name = name[:len(name)-4] + parts := strings.Split(name, "-") + if len(parts) != 5 && len(parts) != 6 { + return nil, fmt.Errorf("wheel name %q has %d elements, not 5 or 6", name, len(parts)) + } + pwi := &WheelInfo{ + Name: parts[0], + Version: parts[1], + } + if len(parts) == 6 { + buildTag := parts[2] + split := strings.IndexFunc(buildTag, func(r rune) bool { + return !unicode.IsDigit(r) + }) + if split == 0 { // Must start with at least one digit. + return nil, fmt.Errorf("invalid wheel name %q: build tag %q does not start with digit", name, buildTag) + } else if split == -1 { + split = len(buildTag) + } + num, err := strconv.Atoi(buildTag[:split]) + if err != nil { + return nil, fmt.Errorf("invalid wheel name %q: %v", name, err) + } + pwi.BuildTag.Num = num + pwi.BuildTag.Tag = buildTag[split:] + } + tag := PEP425Tag{ + Python: parts[len(parts)-3], + ABI: parts[len(parts)-2], + Platform: parts[len(parts)-1], + } + pwi.Platforms = expandPEP425Tag(tag) + return pwi, nil +} + +// WheelMetadata extracts the metadata from a wheel file. The file format is +// defined in PEP 427 (https://www.python.org/dev/peps/pep-0427/#file-format) +// and is relatively simple compared to sdists. In particular: wheels can not +// have a setup.py or setup.cfg and the metadata version must be 1.1 or greater. 
+// This means that the metadata definitely supports dependencies and there is +// nowhere else to specify them. +func WheelMetadata(ctx context.Context, r io.ReaderAt, size int64) (*Metadata, error) { + var meta *Metadata + err := walkZipFiles(r, size, func(name string, r io.Reader) error { + // Metadata lives in <package-name>-<version>.dist-info/METADATA. + dir, name, ok := strings.Cut(name, "/") + if !ok { + return nil + } + if !strings.HasSuffix(dir, ".dist-info") { + return nil + } + if name != "METADATA" { + return nil + } + if meta != nil { + return UnsupportedError{ + msg: "multiple METADATA files", + packageType: "wheel", + } + } + b, err := io.ReadAll(r) + if err != nil { + return err + } + md, err := ParseMetadata(ctx, string(b)) + if err != nil { + return err + } + meta = &md + return nil + }) + if err != nil { + return nil, err + } + if meta == nil { + return nil, UnsupportedError{ + msg: "no METADATA file", + packageType: "wheel", + } + } + return meta, nil +} + +// expandPEP425Tag expands any compressed tag sets in the given tag to produce +// the full set of supported systems. It uses the algorithm described in the PEP +// (https://www.python.org/dev/peps/pep-0425/#compressed-tag-sets). Note this +// can generate a fair number of impossible tags that are not supported by any +// actual Python implementation. +func expandPEP425Tag(tag PEP425Tag) []PEP425Tag { + var allTags []PEP425Tag + for _, py := range strings.Split(tag.Python, ".") { + for _, abi := range strings.Split(tag.ABI, ".") { + for _, plat := range strings.Split(tag.Platform, ".") { + allTags = append(allTags, PEP425Tag{ + Python: py, + ABI: abi, + Platform: plat, + }) + } + } + } + return allTags +} + +// walkZipFiles walks through the files in a zip archive, applying the given +// function one at a time to the name of the file and a reader containing its +// contents until all files have been visited or the first error. 
// Unfortunately there is no clear way to avoid loading the whole file into
// memory; zip files store their file listings at the end so it is not
// necessarily possible to process them sequentially.
//
// walkZipFiles calls callback once per archive entry, in the order reported by
// the zip reader, passing the entry's name and a reader over its contents.
// That reader is only valid for the duration of the call: it is closed before
// the next entry is opened, and also when callback fails, so an aborted walk
// never leaves an entry open. The first error from opening the archive,
// opening an entry, callback, or closing an entry stops the walk and is
// returned.
func walkZipFiles(r io.ReaderAt, size int64, callback func(string, io.Reader) error) error {
	zr, err := zip.NewReader(r, size)
	if err != nil {
		return err
	}
	for _, f := range zr.File {
		rc, err := f.Open()
		if err != nil {
			return err
		}
		if err := callback(f.Name, rc); err != nil {
			// Still release the entry; the callback's error is the one worth
			// reporting, so a secondary Close failure is deliberately dropped.
			_ = rc.Close()
			return err
		}
		if err := rc.Close(); err != nil {
			return err
		}
	}
	return nil
}
+ cases := []struct { + in string + out *WheelInfo + }{ + { + in: "generic-0.0.1-py2.py3-none-any.whl", + out: &WheelInfo{ + Name: "generic", + Version: "0.0.1", + Platforms: []PEP425Tag{{ + Python: "py2", + ABI: "none", + Platform: "any", + }, { + Python: "py3", + ABI: "none", + Platform: "any", + }}, + }, + }, + { + in: "very_generic-0.0.2-cp3.cp2-cp3m.cp2m-win_amd64.win32.whl", + out: &WheelInfo{ + Name: "very_generic", + Version: "0.0.2", + Platforms: []PEP425Tag{{ + Python: "cp3", + ABI: "cp3m", + Platform: "win_amd64", + }, { + Python: "cp3", + ABI: "cp3m", + Platform: "win32", + }, { + Python: "cp3", + ABI: "cp2m", + Platform: "win_amd64", + }, { + Python: "cp3", + ABI: "cp2m", + Platform: "win32", + }, { + Python: "cp2", + ABI: "cp3m", + Platform: "win_amd64", + }, { + Python: "cp2", + ABI: "cp3m", + Platform: "win32", + }, { + Python: "cp2", + ABI: "cp2m", + Platform: "win_amd64", + }, { + Python: "cp2", + ABI: "cp2m", + Platform: "win32", + }}, + }, + }, + { + in: "build_num-1.1.1.1.1-2a-cp3-cp3m-manylinux1_i686.whl", + out: &WheelInfo{ + Name: "build_num", + Version: "1.1.1.1.1", + BuildTag: WheelBuildTag{ + Num: 2, + Tag: "a", + }, + Platforms: []PEP425Tag{{ + Python: "cp3", + ABI: "cp3m", + Platform: "manylinux1_i686", + }}, + }, + }, + { + in: "long_num-1.2-12341234-cp3-cp3um-manylinux1_i686.whl", + out: &WheelInfo{ + Name: "long_num", + Version: "1.2", + BuildTag: WheelBuildTag{ + Num: 12341234, + }, + Platforms: []PEP425Tag{{ + Python: "cp3", + ABI: "cp3um", + Platform: "manylinux1_i686", + }}, + }, + }, + { + in: "too_short-py3-macosx_10_6_intel.whl", + out: nil, + }, + { + in: "obvious-too-long-1.3.4-abcd--py3-none-any.whl", + out: nil, + }, + { + in: "not-a-wheel-at-all.zip", + out: nil, + }, + { + in: "badtag-1.1-ab123-cp2.cp3-cp2d.cp3d-linux_x86_64.whl", + out: nil, + }, + // Some cases that are invalid are quite hard to distinguish. 
+ { + in: "too-long-1.2.3-py2-none-win_amd64.whl", + out: &WheelInfo{ + Name: "too", + Version: "long", + BuildTag: struct { + Num int + Tag string + }{ + Num: 1, + Tag: ".2.3", + }, + Platforms: []PEP425Tag{{ + Python: "py2", + ABI: "none", + Platform: "win_amd64", + }}, + }, + }, + } + for _, c := range cases { + if got, err := ParseWheelName(c.in); c.out == nil && err == nil { + t.Errorf("parse wheel name %q: want error, got: %+v", c.in, got) + } else if c.out != nil && err != nil { + t.Errorf("parse wheel name %q: want success, got err: %v", c.in, err) + } else if c.out != nil && !reflect.DeepEqual(c.out, got) { + t.Errorf("parse wheel name %q:\nwant: %#v\n got: %#v", c.in, c.out, got) + } + } +}