diff --git a/.gitattributes b/.gitattributes
index d773688..782bf9c 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,2 @@
 src/text/** linguist-vendored
+src/detection/cache.bin.zstd filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 61b0894..6477e8a 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -12,9 +12,9 @@ name: CI
 jobs:
   lint:
     name: Lint
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - uses: dtolnay/rust-toolchain@stable
         with:
           components: "rustfmt,clippy"
@@ -30,16 +30,16 @@ jobs:
   deny-check:
     name: cargo-deny check
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - uses: EmbarkStudios/cargo-deny-action@v2
 
   msrv-check:
     name: Minimum Stable Rust Version Check
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - uses: dtolnay/rust-toolchain@1.85.0
       - run: cargo fetch
       - name: cargo check
@@ -49,12 +49,21 @@ jobs:
   test:
     name: Test
     strategy:
       matrix:
-        os: [ubuntu-22.04]
+        os: [ubuntu-24.04]
     runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
+        with:
+          lfs: true
+      - run: git lfs checkout
       - uses: dtolnay/rust-toolchain@stable
       - run: cargo fetch
       - name: cargo build
         run: cargo build --tests --all-features
       - run: cargo test --all-features
+
+  test_success:
+    runs-on: ubuntu-24.04
+    needs: [lint, test, deny-check, msrv-check]
+    steps:
+      - run: echo "All test jobs passed"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 69a578b..d3be890 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased] - ReleaseDate
+### Added
+- [PR#84](https://github.com/EmbarkStudios/spdx/pull/84) resolved [#67](https://github.com/EmbarkStudios/spdx/issues/67) by inlining the `askalono` crate to allow detection of license texts or headers from arbitrary text data. There are multiple feature flags associated with this new feature.
+
 ## [0.12.0] - 2025-08-19
 ### Added
 - [PR#81](https://github.com/EmbarkStudios/spdx/pull/81) resolved [#68](https://github.com/EmbarkStudios/spdx/issues/68) by adding support for the ` WITH [%s"DocumentRef-"(idstring)":"]%s"AdditionRef-"(idstring)` syntax. Thanks [@weihanglo](https://github.com/weihanglo)!
diff --git a/Cargo.lock b/Cargo.lock
index 647bf30..94f0a7b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,15 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "bstr"
 version = "1.11.0"
@@ -13,6 +22,18 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "cc"
+version = "1.2.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c481bdbf0ed3b892f6f806287d72acd515b352a4ec27a208489b8c1bc839633a"
+dependencies = [
+ "find-msvc-tools",
+ "jobserver",
+ "libc",
+ "shlex",
+]
+
 [[package]]
 name = "console"
 version = "0.15.8"
@@ -25,12 +46,58 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
 [[package]]
 name = "encode_unicode"
 version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
 
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844"
+
+[[package]]
+name = "jobserver"
+version = "0.1.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "lazy_static"
 version = "1.5.0"
@@ -49,6 +116,12 @@ version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
+[[package]]
+name = "pkg-config"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.89"
@@ -67,11 +140,54 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rayon"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "regex"
+version = "1.12.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata", + "regex-syntax", +] + [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "serde" @@ -93,6 +209,12 @@ dependencies = [ "syn", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "similar" version = "2.6.0" @@ -123,8 +245,12 @@ checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" name = "spdx" version = "0.12.0" dependencies = [ + "rayon", + "regex", "similar-asserts", "smallvec", + "unicode-normalization", + "zstd", ] [[package]] @@ -138,12 +264,36 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "unicode-ident" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -222,3 +372,31 @@ name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index 659f306..c491b05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,10 +26,23 @@ include = [ [features] # Includes the full canonical text of each license text = [] +# Allows analysis of text to determine if it might be an SPDX license text +detection = ["regex", "unicode-normalization"] +# Allows de/serialization of a spdx::detection::Store for quicker loading +detection-cache = ["detection", 
"zstd"] +# Inlines a cache into this crate, which contains all of the licenses from the +# SPDX crate that the crate version was packaged with +detection-inline-cache = ["detection-cache"] +# Performs license detection in parallel within the same text +detection-parallel = ["detection", "rayon"] [dependencies] +rayon = { version = "1.11", optional = true } +regex = { version = "1.12", optional = true } # In most cases expressions are quite small so we can avoid heap allocations smallvec = "1.15" +unicode-normalization = { version = "0.1", optional = true } +zstd = { version = "0.13", optional = true } [dev-dependencies] # Used to print colored diffs in case of test failures diff --git a/README.md b/README.md index 3842bed..5ee09c1 100644 --- a/README.md +++ b/README.md @@ -17,39 +17,49 @@ +## About + +This crate's main purpose is to parse and evaluate SPDX license expressions. It also optionally provides the ability to scan text data for SPDX license information. Each version of this crate contains a specific version of the official [SPDX license list](https://spdx.org/licenses/) which can be retrieved via the `spdx::identifiers::VERSION` constant. + +## Features + +- `text` - Includes the full canonical text of each license +- `detection` - Allows analysis of text to determine if it might be an SPDX license text, or have an SPDX license header +- `detection-cache` - Allows de/serialization of a `Store` for quicker loading +- `detection-inline-cache` - Inlines a `Store` cache into this crate, which allows easier loading in downstream crates at the cost of increased binary size +- `detection-parallel` - Performs license detection in parallel within the same text + ## Usage ```rust use spdx::Expression; -fn main() { - let this_is_fine = Expression::parse("MIT OR Apache-2.0").unwrap(); - - assert!(this_is_fine.evaluate(|req| { - if let spdx::LicenseItem::Spdx { id, .. } = req.license { - // Both MIT and Apache-2.0 are OSI approved, so this expression - // evaluates to true - return id.is_osi_approved(); - } - - false - })); - - assert!(!this_is_fine.evaluate(|req| { - if let spdx::LicenseItem::Spdx { id, .. } = req.license { - // This is saying we don't accept any licenses that are OSI approved - // so the expression will evaluate to false as both sides of the OR - // are now rejected - return !id.is_osi_approved(); - } - - false - })); - - // `NOPE` is not a valid SPDX license identifier, so this expression - // will fail to parse - let _this_is_not = Expression::parse("MIT OR NOPE").unwrap_err(); -} +let this_is_fine = Expression::parse("MIT OR Apache-2.0").unwrap(); + +assert!(this_is_fine.evaluate(|req| { + if let spdx::LicenseItem::Spdx { id, .. } = req.license { + // Both MIT and Apache-2.0 are OSI approved, so this expression + // evaluates to true + return id.is_osi_approved(); + } + + false +})); + +assert!(!this_is_fine.evaluate(|req| { + if let spdx::LicenseItem::Spdx { id, .. } = req.license { + // This is saying we don't accept any licenses that are OSI approved + // so the expression will evaluate to false as both sides of the OR + // are now rejected + return !id.is_osi_approved(); + } + + false +})); + +// `NOPE` is not a valid SPDX license identifier, so this expression +// will fail to parse +let _this_is_not = Expression::parse("MIT OR NOPE").unwrap_err(); ``` ## Updating SPDX list diff --git a/src/detection.rs b/src/detection.rs new file mode 100644 index 0000000..16b3673 --- /dev/null +++ b/src/detection.rs @@ -0,0 +1,191 @@ +// Copyright 2018 Amazon.com, Inc. 
diff --git a/src/detection.rs b/src/detection.rs
new file mode 100644
index 0000000..16b3673
--- /dev/null
+++ b/src/detection.rs
@@ -0,0 +1,191 @@
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! This module is basically an inlining of [askalono](https://github.com/jpeddicord/askalono).
+//!
+//! Askalono is not really maintained and also depends on other unmaintained
+//! crates. Since this crate is used by both cargo-deny and cargo-about in
+//! conjunction with askalono for checking licenses, I'm pulling it directly
+//! into this crate just to avoid all of the external dependencies.
+
+use std::collections::HashMap;
+
+#[cfg(feature = "detection-cache")]
+mod cache;
+mod detect;
+#[cfg(feature = "detection-inline-cache")]
+mod inline_cache;
+mod license;
+pub use license::{LicenseType, TextData};
+mod ngram;
+mod preproc;
+/// Contains utilities for scanning texts for license information
+pub mod scan;
+
+/// An entry in a [`Store`]
+pub struct LicenseEntry {
+    /// The original license text
+    pub original: TextData,
+    /// Set of license identifiers that are aliases (i.e. same license text) as
+    /// this entry
+    pub aliases: Vec<String>,
+    /// Set of headers that can be used to specify this license applies to a larger file
+    pub headers: Vec<TextData>,
+    /// Similar license texts that will also be scored as this license if detected
+    pub alternates: Vec<TextData>,
+}
+
+impl LicenseEntry {
+    /// Creates a new [`Self`] with the specified text
+    pub fn new(original: TextData) -> Self {
+        Self {
+            original,
+            aliases: Vec::new(),
+            alternates: Vec::new(),
+            headers: Vec::new(),
+        }
+    }
+}
+
+/// A representation of a collection of known licenses.
+///
+/// This struct is generally what you want to start with if you're looking to
+/// match text against a database of licenses. Load a cache from disk using
+/// `from_cache`, then use the `analyze` function to determine what a text most
+/// closely matches.
+#[derive(Default)]
+pub struct Store {
+    pub(crate) licenses: HashMap<String, LicenseEntry>,
+}
+
+impl Store {
+    /// Create a new `Store`.
+    ///
+    /// More often, you probably want to use `from_cache` instead of creating
+    /// an empty store.
+    pub fn new() -> Self {
+        Self {
+            licenses: HashMap::new(),
+        }
+    }
+
+    /// Get the number of licenses in the store.
+    ///
+    /// This only counts licenses by name -- headers, aliases, and alternates
+    /// aren't included in the count.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.licenses.len()
+    }
+
+    /// Check if the store is empty.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.licenses.is_empty()
+    }
+
+    /// Get all licenses by name via iterator.
+    #[inline]
+    pub fn licenses(&self) -> impl Iterator<Item = &String> {
+        self.licenses.keys()
+    }
+
+    /// Get a license's standard `TextData` by name.
+    #[inline]
+    pub fn get_original(&self, name: &str) -> Option<&TextData> {
+        self.licenses.get(name).map(|le| &le.original)
+    }
+
+    /// Add a single license to the store.
+    ///
+    /// If the license with the given name already existed, it and all of its
+    /// variants will be replaced.
+    #[inline]
+    pub fn add_license(&mut self, name: String, data: TextData) {
+        let entry = LicenseEntry::new(data);
+        self.licenses.insert(name, entry);
+    }
+
+    /// Inserts a full `LicenseEntry`
+    #[inline]
+    pub fn insert_entry(&mut self, name: String, entry: LicenseEntry) {
+        self.licenses.insert(name, entry);
+    }
+
+    /// Gets an iterator over all of the licenses
+    #[inline]
+    pub fn iter(&self) -> std::collections::hash_map::Iter<'_, String, LicenseEntry> {
+        self.licenses.iter()
+    }
+
+    /// Add a variant (a header or alternate formatting) of a given license to
+    /// the store.
+    ///
+    /// The license must already exist. This function cannot be used to replace
+    /// the original/canonical text of the license.
+    #[inline]
+    pub fn add_variant(
+        &mut self,
+        name: &str,
+        variant: LicenseType,
+        data: TextData,
+    ) -> Result<(), StoreError> {
+        let entry = self
+            .licenses
+            .get_mut(name)
+            .ok_or(StoreError::UnknownLicense)?;
+
+        match variant {
+            LicenseType::Alternate => {
+                entry.alternates.push(data);
+            }
+            LicenseType::Header => {
+                entry.headers.push(data);
+            }
+            LicenseType::Original => {
+                return Err(StoreError::OriginalInvalidForVariant);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Get the list of aliases for a given license.
+    #[inline]
+    pub fn aliases(&self, name: &str) -> Option<&Vec<String>> {
+        self.licenses.get(name).map(|le| &le.aliases)
+    }
+
+    /// Set the list of aliases for a given license.
+    #[inline]
+    pub fn set_aliases(&mut self, name: &str, aliases: Vec<String>) -> Result<(), StoreError> {
+        let entry = self
+            .licenses
+            .get_mut(name)
+            .ok_or(StoreError::UnknownLicense)?;
+        entry.aliases = aliases;
+        Ok(())
+    }
+}
+
+/// The errors that can occur when interacting with a [`Store`]
+#[derive(Copy, Clone, PartialEq, Debug)]
+pub enum StoreError {
+    /// The license name was not in the Store
+    UnknownLicense,
+    /// Attempted to call `Store::add_variant` with `LicenseType::Original`
+    OriginalInvalidForVariant,
+}
+
+impl std::fmt::Display for StoreError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::UnknownLicense => f.write_str("specified license did not exist in the store"),
+            Self::OriginalInvalidForVariant => {
+                f.write_str("attempted to add an original license text as a variant")
+            }
+        }
+    }
+}
+
+impl std::error::Error for StoreError {}
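A short sketch of how the `Store` building blocks above compose (the texts here are placeholders; error handling is limited to the `StoreError` cases defined in this file):

```rust
use spdx::detection::{LicenseType, Store, StoreError, TextData};

fn build_store() -> Result<Store, StoreError> {
    let mut store = Store::new();
    store.add_license("Apache-2.0".to_owned(), TextData::new("...full license text..."));

    // Headers and alternates hang off an existing entry...
    store.add_variant(
        "Apache-2.0",
        LicenseType::Header,
        TextData::new("...the short per-file header..."),
    )?;
    store.set_aliases("Apache-2.0", vec!["Apache2".to_owned()])?;

    // ...so registering a variant for an unknown name fails.
    assert_eq!(
        store.add_variant("nope", LicenseType::Header, TextData::new("x")),
        Err(StoreError::UnknownLicense)
    );
    Ok(store)
}
```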
diff --git a/src/detection/cache.bin.zstd b/src/detection/cache.bin.zstd
new file mode 100644
index 0000000..76b2611
--- /dev/null
+++ b/src/detection/cache.bin.zstd
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3443c72f3d1bbd769c78fc1bba82af691eb7e34c6e04adc61a86a2a03ae97805
+size 2018306
diff --git a/src/detection/cache.rs b/src/detection/cache.rs
new file mode 100644
index 0000000..fb4ac38
--- /dev/null
+++ b/src/detection/cache.rs
@@ -0,0 +1,384 @@
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use crate::detection::{LicenseEntry, Store, license::TextData, ngram::NgramSet};
+use std::io;
+
+const CACHE_VERSION: &str = "spdx-crate-01";
+
+#[derive(Debug)]
+pub enum CacheError {
+    Io(io::Error),
+    InvalidVersion {
+        actual: String,
+        expected: &'static str,
+    },
+    Proto(ProtoError),
+}
+
+impl std::fmt::Display for CacheError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Io(io) => write!(f, "{io}"),
+            Self::Proto(p) => write!(f, "{p}"),
+            Self::InvalidVersion { actual, expected } => {
+                write!(f, "expected version {expected}, but got version {actual}")
+            }
+        }
+    }
+}
+
+impl std::error::Error for CacheError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            Self::Io(io) => Some(io),
+            Self::Proto(p) => Some(p),
+            Self::InvalidVersion { .. } => None,
+        }
+    }
+}
+
+impl From<BinErr> for CacheError {
+    fn from(b: BinErr) -> Self {
+        match b {
+            BinErr::Io(i) => Self::Io(i),
+            BinErr::Proto(p) => Self::Proto(p),
+        }
+    }
+}
+
+impl From<io::Error> for CacheError {
+    fn from(e: io::Error) -> Self {
+        Self::Io(e)
+    }
+}
+
+impl Store {
+    /// Create a store from a cache file.
+    ///
+    /// This method is highly useful for quickly loading a cache, as creating
+    /// one from text data is rather slow. This method can typically load
+    /// the full SPDX set from disk in < 100ms.
+    ///
+    /// The cache contains a simple version header that ensures that the cache
+    /// is loadable
+    pub fn from_cache<R>(mut readable: R) -> Result<Self, CacheError>
+    where
+        R: io::Read + Sized,
+    {
+        let mut header = [0u8; 13];
+        readable.read_exact(&mut header)?;
+
+        if header != CACHE_VERSION.as_bytes() {
+            return Err(CacheError::InvalidVersion {
+                actual: String::from_utf8_lossy(&header).into_owned(),
+                expected: CACHE_VERSION,
+            });
+        }
+
+        let mut dec = zstd::Decoder::new(readable)?;
+        Ok(Self::bread(&mut dec)?)
+    }
+
+    /// Serialize the current store.
+    pub fn to_cache<W>(&self, mut writable: W) -> Result<(), CacheError>
+    where
+        W: io::Write + Sized,
+    {
+        writable.write_all(CACHE_VERSION.as_bytes())?;
+
+        let mut enc = zstd::Encoder::new(writable, 21)?;
+        self.bwrite(&mut enc)?;
+        enc.finish()?;
+
+        Ok(())
+    }
+}
+
+#[derive(Debug)]
+pub enum ProtoError {
+    TooLong(usize),
+    Utf8(std::string::FromUtf8Error),
+}
+
+impl std::fmt::Display for ProtoError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::TooLong(tl) => write!(f, "{tl:016x} is too large to fit in a u16"),
+            Self::Utf8(u) => write!(f, "{u}"),
+        }
+    }
+}
+
+impl std::error::Error for ProtoError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        if let Self::Utf8(u) = self {
+            Some(u)
+        } else {
+            None
+        }
+    }
+}
+
+enum BinErr {
+    Io(io::Error),
+    Proto(ProtoError),
+}
+
+impl From<io::Error> for BinErr {
+    fn from(e: io::Error) -> Self {
+        Self::Io(e)
+    }
+}
+
+impl From<ProtoError> for BinErr {
+    fn from(e: ProtoError) -> Self {
+        Self::Proto(e)
+    }
+}
+
+#[inline]
+fn write_u16<W>(u: usize, w: &mut W) -> Result<(), BinErr>
+where
+    W: io::Write + Sized,
+{
+    let u: u16 = u.try_into().map_err(|_e| ProtoError::TooLong(u))?;
+    w.write_all(&u.to_le_bytes()).map_err(BinErr::Io)
+}
+
+#[inline]
+fn read_u16<R>(r: &mut R) -> Result<usize, BinErr>
+where
+    R: io::Read + Sized,
+{
+    let mut u = [0u8; 2];
+    r.read_exact(&mut u)?;
+    Ok(u16::from_le_bytes(u) as usize)
+}
+
+#[inline]
+fn write_u64<W>(u: usize, w: &mut W) -> Result<(), BinErr>
+where
+    W: io::Write + Sized,
+{
+    w.write_all(&(u as u64).to_le_bytes()).map_err(BinErr::Io)
+}
+
+#[inline]
+fn read_u64<R>(r: &mut R) -> Result<usize, BinErr>
+where
+    R: io::Read + Sized,
+{
+    let mut b = [0u8; 8];
+    r.read_exact(&mut b)?;
+    Ok(u64::from_le_bytes(b) as usize)
+}
+
+impl Bin for String {
+    fn bwrite<W>(&self, w: &mut W) -> Result<(), BinErr>
+    where
+        W: io::Write + Sized,
+    {
+        write_u16(self.len(), w)?;
+        w.write_all(self.as_bytes()).map_err(BinErr::Io)
+    }
+
+    fn bread<R>(r: &mut R) -> Result<Self, BinErr>
+    where
+        R: io::Read + Sized,
+    {
+        let len = read_u16(r)?;
+        let mut s = vec![0; len];
+        // read_exact errors on a short read instead of spinning forever if
+        // the reader hits EOF before the full string arrives
+        r.read_exact(&mut s)?;
+
+        Ok(String::from_utf8(s).map_err(ProtoError::Utf8)?)
+    }
+}
+
+#[inline]
+fn write_vec<B, W>(v: &[B], w: &mut W) -> Result<(), BinErr>
+where
+    W: io::Write + Sized,
+    B: Bin,
+{
+    write_u16(v.len(), w)?;
+
+    for b in v {
+        b.bwrite(w)?;
+    }
+
+    Ok(())
+}
+
+#[inline]
+fn read_vec<B, R>(r: &mut R) -> Result<Vec<B>, BinErr>
+where
+    R: io::Read + Sized,
+    B: Bin,
+{
+    let len = read_u16(r)?;
+
+    let mut v = Vec::with_capacity(len);
+
+    for _ in 0..len {
+        v.push(B::bread(r)?);
+    }
+
+    Ok(v)
+}
+
+trait Bin: Sized {
+    fn bwrite<W>(&self, w: &mut W) -> Result<(), BinErr>
+    where
+        W: io::Write + Sized;
+    fn bread<R>(r: &mut R) -> Result<Self, BinErr>
+    where
+        R: io::Read + Sized;
+}
+
+impl Bin for Store {
+    fn bwrite<W>(&self, w: &mut W) -> Result<(), BinErr>
+    where
+        W: io::Write + Sized,
+    {
+        write_u16(self.licenses.len(), w)?;
+
+        for (k, v) in &self.licenses {
+            k.bwrite(w)?;
+            v.bwrite(w)?;
+        }
+
+        Ok(())
+    }
+
+    fn bread<R>(r: &mut R) -> Result<Self, BinErr>
+    where
+        R: io::Read + Sized,
+    {
+        let map_count = read_u16(r)?;
+
+        let mut licenses = std::collections::HashMap::new();
+
+        for _ in 0..map_count {
+            let key = String::bread(r)?;
+            let value = LicenseEntry::bread(r)?;
+
+            licenses.insert(key, value);
+        }
+
+        Ok(Self { licenses })
+    }
+}
+
+impl Bin for LicenseEntry {
+    fn bwrite<W>(&self, w: &mut W) -> Result<(), BinErr>
+    where
+        W: io::Write + Sized,
+    {
+        self.original.bwrite(w)?;
+        write_vec(&self.aliases, w)?;
+        write_vec(&self.headers, w)?;
+        write_vec(&self.alternates, w)?;
+
+        Ok(())
+    }
+
+    fn bread<R>(r: &mut R) -> Result<Self, BinErr>
+    where
+        R: io::Read + Sized,
+    {
+        Ok(Self {
+            original: TextData::bread(r)?,
+            aliases: read_vec(r)?,
+            headers: read_vec(r)?,
+            alternates: read_vec(r)?,
+        })
+    }
+}
+
+impl Bin for TextData {
+    fn bwrite<W>(&self, w: &mut W) -> Result<(), BinErr>
+    where
+        W: io::Write + Sized,
+    {
+        self.match_data.bwrite(w)?;
+        write_u64(self.lines_view.0, w)?;
+        write_u64(self.lines_view.1, w)?;
+        write_vec(&self.lines_normalized, w)?;
+        self.text_processed.bwrite(w)?;
+
+        Ok(())
+    }
+
+    fn bread<R>(r: &mut R) -> Result<Self, BinErr>
+    where
+        R: io::Read + Sized,
+    {
+        Ok(Self {
+            match_data: NgramSet::bread(r)?,
+            lines_view: (read_u64(r)?, read_u64(r)?),
+            lines_normalized: read_vec(r)?,
+            text_processed: String::bread(r)?,
+        })
+    }
+}
+
+impl Bin for u32 {
+    fn bwrite<W>(&self, w: &mut W) -> Result<(), BinErr>
+    where
+        W: io::Write + Sized,
+    {
+        w.write_all(&self.to_le_bytes()).map_err(BinErr::Io)
+    }
+
+    fn bread<R>(r: &mut R) -> Result<Self, BinErr>
+    where
+        R: io::Read + Sized,
+    {
+        let mut b = [0; 4];
+        r.read_exact(&mut b)?;
+        Ok(u32::from_le_bytes(b))
+    }
+}
+
+impl Bin for NgramSet {
+    fn bwrite<W>(&self, w: &mut W) -> Result<(), BinErr>
+    where
+        W: io::Write + Sized,
+    {
+        write_u16(self.map.len(), w)?;
+        for (k, v) in &self.map {
+            k.bwrite(w)?;
+            v.bwrite(w)?;
+        }
+        w.write_all(&[self.n])?;
+        write_u64(self.size, w)?;
+
+        Ok(())
+    }
+
+    fn bread<R>(r: &mut R) -> Result<Self, BinErr>
+    where
+        R: io::Read + Sized,
+    {
+        let map_len = read_u16(r)?;
+        let mut map = std::collections::HashMap::new();
+        for _ in 0..map_len {
+            let k = String::bread(r)?;
+            let v = u32::bread(r)?;
+
+            map.insert(k, v);
+        }
+        let mut n = [0; 1];
+        r.read_exact(&mut n)?;
+        let size = read_u64(r)?;
+
+        Ok(Self { map, n: n[0], size })
+    }
+}
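The cache format above is just the `spdx-crate-01` version string followed by a zstd-compressed stream of the length-prefixed records written by the `Bin` trait. A round-trip sketch (assuming the `detection-cache` feature):

```rust
use spdx::detection::{Store, TextData};

fn roundtrip() -> Result<(), Box<dyn std::error::Error>> {
    let mut store = Store::new();
    store.add_license("MIT".to_owned(), TextData::new("permission is hereby granted..."));

    // Any io::Write works as the sink; a Vec<u8> keeps it in memory.
    let mut buf = Vec::new();
    store.to_cache(&mut buf)?;

    // And any io::Read works as the source.
    let loaded = Store::from_cache(buf.as_slice())?;
    assert_eq!(store.len(), loaded.len());
    Ok(())
}
```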
diff --git a/src/detection/detect.rs b/src/detection/detect.rs
new file mode 100644
index 0000000..c934235
--- /dev/null
+++ b/src/detection/detect.rs
@@ -0,0 +1,141 @@
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::{cmp::Ordering, fmt};
+
+use crate::detection::{
+    license::LicenseType,
+    license::TextData,
+    {LicenseEntry, Store},
+};
+
+/// Information about text that was compared against licenses in the store.
+///
+/// This only contains information about the overall match; to uncover more
+/// data you can run methods like `optimize_bounds` on `TextData`.
+///
+/// Its lifetime is tied to the lifetime of the `Store` it was generated from.
+#[derive(Clone)]
+pub struct Match<'a> {
+    /// Confidence score of the match, ranging from 0 to 1.
+    pub score: f32,
+    /// The name of the closest matching license in the `Store`. This will
+    /// always be something that exists in the store, regardless of the score.
+    pub name: &'a str,
+    /// The type of the license that matched. Useful to know if the match was
+    /// the complete text, a header, or something else.
+    pub license_type: LicenseType,
+    /// A reference to the license data that matched inside the `Store`. May be
+    /// useful for diagnostic purposes or to further optimize the result.
+    pub data: &'a TextData,
+}
+
+/// A lighter version of Match to be used during analysis.
+/// Reduces the need for cloning a bunch of fields.
+struct PartialMatch<'a> {
+    pub name: &'a str,
+    pub score: f32,
+    pub license_type: LicenseType,
+    pub data: &'a TextData,
+}
+
+impl<'a> PartialOrd for PartialMatch<'a> {
+    fn partial_cmp(&self, other: &PartialMatch<'_>) -> Option<Ordering> {
+        self.score.partial_cmp(&other.score)
+    }
+}
+
+impl<'a> PartialEq for PartialMatch<'a> {
+    fn eq(&self, other: &PartialMatch<'_>) -> bool {
+        self.score.eq(&other.score)
+            && self.name == other.name
+            && self.license_type == other.license_type
+    }
+}
+
+impl<'a> fmt::Debug for Match<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "Match {{ score: {}, name: {}, license_type: {:?} }}",
+            self.score, self.name, self.license_type
+        )
+    }
+}
+
+impl Store {
+    /// Compare the given `TextData` against all licenses in the `Store`.
+    ///
+    /// This parallelizes the search as much as it can to find the best match.
+    /// Once a match is obtained, it can be optimized further; see methods on
+    /// `TextData` for more information.
+    pub fn analyze<'a>(&'a self, text: &TextData) -> Match<'a> {
+        let mut res: Vec<PartialMatch<'a>>;
+
+        let analyze_fold =
+            |mut acc: Vec<PartialMatch<'a>>, (name, data): (&'a String, &'a LicenseEntry)| {
+                acc.push(PartialMatch {
+                    score: data.original.match_score(text),
+                    name,
+                    license_type: LicenseType::Original,
+                    data: &data.original,
+                });
+                data.alternates.iter().for_each(|alt| {
+                    acc.push(PartialMatch {
+                        score: alt.match_score(text),
+                        name,
+                        license_type: LicenseType::Alternate,
+                        data: alt,
+                    });
+                });
+                data.headers.iter().for_each(|head| {
+                    acc.push(PartialMatch {
+                        score: head.match_score(text),
+                        name,
+                        license_type: LicenseType::Header,
+                        data: head,
+                    });
+                });
+
+                acc
+            };
+
+        // parallel analysis
+        #[cfg(feature = "detection-parallel")]
+        {
+            use rayon::prelude::*;
+            res = self
+                .licenses
+                .par_iter()
+                .fold(Vec::new, analyze_fold)
+                .reduce(
+                    Vec::new,
+                    |mut a: Vec<PartialMatch<'a>>, b: Vec<PartialMatch<'a>>| {
+                        a.extend(b);
+                        a
+                    },
+                );
+            res.par_sort_unstable_by(|a, b| b.partial_cmp(a).unwrap());
+        }
+
+        // single-threaded analysis
+        #[cfg(not(feature = "detection-parallel"))]
+        {
+            res = self
+                .licenses
+                .iter()
+                // len of licenses isn't strictly correct, but it'll do
+                .fold(Vec::with_capacity(self.licenses.len()), analyze_fold);
+            res.sort_unstable_by(|a, b| b.partial_cmp(a).unwrap());
+        }
+
+        let m = &res[0];
+
+        Match {
+            score: m.score,
+            name: m.name,
+            license_type: m.license_type,
+            data: m.data,
+        }
+    }
+}
diff --git a/src/detection/inline_cache.rs b/src/detection/inline_cache.rs
new file mode 100644
index 0000000..07c3bf4
--- /dev/null
+++ b/src/detection/inline_cache.rs
@@ -0,0 +1,9 @@
+const CACHE: &[u8] = include_bytes!("cache.bin.zstd");
+
+impl crate::detection::Store {
+    /// Attempts to load the cached store inlined into this crate's source
+    #[inline]
+    pub fn load_inline() -> Result<Self, super::cache::CacheError> {
+        Self::from_cache(CACHE)
+    }
+}
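With the `detection-inline-cache` feature, loading a full store reduces to a one-liner for downstream crates. A sketch (the `LICENSE` path is illustrative):

```rust
use spdx::detection::{Store, TextData};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Decompresses the cache.bin.zstd blob that was compiled into the crate.
    let store = Store::load_inline()?;

    let text = std::fs::read_to_string("LICENSE")?;
    let m = store.analyze(&TextData::new(text.as_str()));
    println!("{} ({:.0}%)", m.name, m.score * 100.0);
    Ok(())
}
```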
diff --git a/src/detection/license.rs b/src/detection/license.rs
new file mode 100644
index 0000000..795e9ae
--- /dev/null
+++ b/src/detection/license.rs
@@ -0,0 +1,389 @@
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::{collections::HashMap, fmt};
+
+use crate::detection::{
+    ngram::NgramSet,
+    preproc::{apply_aggressive, apply_normalizers},
+};
+
+/// The type of a license entry (typically in a `Store`).
+#[derive(Clone, Copy, PartialEq, Debug)]
+pub enum LicenseType {
+    /// The canonical text of the license.
+    Original,
+    /// A license header. There may be more than one in a `Store`.
+    Header,
+    /// An alternate form of a license. This is intended to be used for
+    /// alternate _formats_ of a license, not for variants where the text has
+    /// different meaning. Not currently used in askalono's SPDX dataset.
+    Alternate,
+}
+
+impl fmt::Display for LicenseType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{}",
+            match *self {
+                LicenseType::Original => "original text",
+                LicenseType::Header => "license header",
+                LicenseType::Alternate => "alternate text",
+            }
+        )
+    }
+}
+
+/// A structure representing compiled text/matching data.
+///
+/// This is the key structure used to compare two texts against one another. It
+/// handles pre-processing the text to n-grams, scoring, and optimizing the
+/// result to try to identify specific details about a match.
+///
+/// # Examples
+///
+/// Basic scoring of two texts:
+///
+/// ```
+/// use spdx::detection::TextData;
+///
+/// let license = TextData::from("My First License");
+/// let sample = TextData::from("copyright 20xx me irl\n\n // my first license");
+/// assert_eq!(sample.match_score(&license), 1.0);
+/// ```
+///
+/// The above example is a perfect match, as identifiable copyright statements
+/// are stripped out during pre-processing.
+///
+/// Building on that, `TextData` is able to tell you _where_ in the text a
+/// license is located:
+///
+/// ```
+/// # use std::error::Error;
+/// # use spdx::detection::TextData;
+/// # fn main() -> Result<(), Box<dyn Error>> {
+/// # let license = TextData::from("My First License");
+/// let sample = TextData::from("copyright 20xx me irl\n// My First License\nfn hello() {\n ...");
+/// let (optimized, score) = sample.optimize_bounds(&license);
+/// assert_eq!((1, 2), optimized.lines_view());
+/// assert!(score > 0.99f32, "license within text matches");
+/// # Ok(())
+/// # }
+/// ```
+#[derive(Clone, Debug)]
+pub struct TextData {
+    pub(crate) match_data: NgramSet,
+    pub(crate) lines_view: (usize, usize),
+    pub(crate) lines_normalized: Vec<String>,
+    pub(crate) text_processed: String,
+}
+
+impl TextData {
+    /// Create a new `TextData` structure from a string.
+    ///
+    /// The given text will be normalized, then smashed down into n-grams for
+    /// matching. By default, the normalized text is stored inside the
+    /// structure for future diagnostics. This is necessary for optimizing a
+    /// match and for diffing against other texts. If you don't want this extra
+    /// data, you can call `without_text` to throw it out. Generally, as a user
+    /// of this library you want to keep the text data, but askalono will throw
+    /// it away in its own `Store` as it's not needed.
+    pub fn new(text: &str) -> Self {
+        let lines_normalized = apply_normalizers(text);
+        let normalized_joined = lines_normalized.join("\n");
+        let text_processed = apply_aggressive(&normalized_joined);
+        let match_data = NgramSet::from_str(&text_processed, 2);
+
+        Self {
+            match_data,
+            lines_view: (0, lines_normalized.len()),
+            lines_normalized,
+            text_processed,
+        }
+    }
+
+    /// Consume this `TextData`, returning one without normalized/processed
+    /// text stored.
+    ///
+    /// Unless you know you don't want the text, you probably don't want to use
+    /// this. Other methods on `TextData` require that text is present.
+    pub fn without_text(self) -> Self {
+        Self {
+            match_data: self.match_data,
+            lines_view: (0, 0),
+            lines_normalized: Vec::new(),
+            text_processed: String::new(),
+        }
+    }
+
+    /// Get the bounds of the active line view.
+    ///
+    /// This represents the "active" region of lines that matches are generated
+    /// from. The bounds are a 0-indexed `(start, end)` tuple, with inclusive
+    /// start and exclusive end indices. See `optimize_bounds`.
+    ///
+    /// This is largely for informational purposes; other methods in
+    /// `TextData`, such as `lines` and `match_score`, will already account for
+    /// the line range. However, it's useful to call it after running
+    /// `optimize_bounds` to discover where in the input text the match was
+    /// located.
+    pub fn lines_view(&self) -> (usize, usize) {
+        self.lines_view
+    }
+
+    /// Clone this `TextData`, creating a copy with the given view.
+    ///
+    /// This will re-generate match data for the given view. It's used in
+    /// `optimize_bounds` to shrink/expand the view of the text to discover
+    /// bounds.
+    ///
+    /// Other methods on `TextData` respect this boundary, so it's not needed
+    /// outside this struct.
+    pub fn with_view(&self, start: usize, end: usize) -> Self {
+        let view = &self.lines_normalized[start..end];
+        let view_joined = view.join("\n");
+        let text_processed = apply_aggressive(&view_joined);
+
+        Self {
+            match_data: NgramSet::from_str(&text_processed, 2),
+            lines_view: (start, end),
+            lines_normalized: self.lines_normalized.clone(),
+            text_processed,
+        }
+    }
+
+    /// "Erase" the current lines in view and restore the view to its original
+    /// bounds.
+    ///
+    /// For example, consider a file with two licenses in it. One was identified
+    /// (and located) with `optimize_bounds`. Now you want to find the other:
+    /// white-out the matched lines, and re-run the overall search to find a
+    /// new high score.
+    pub fn white_out(&self) -> Self {
+        // note that we're not using the view here...
+        let lines = &self.lines_normalized;
+
+        // ...because it's used here to exclude lines
+        let new_normalized: Vec<String> = lines
+            .iter()
+            .enumerate()
+            .map(|(i, line)| {
+                if i >= self.lines_view.0 && i < self.lines_view.1 {
+                    "".to_string()
+                } else {
+                    line.clone()
+                }
+            })
+            .collect();
+
+        let text_processed = apply_aggressive(&new_normalized.join("\n"));
+        Self {
+            match_data: NgramSet::from_str(&text_processed, 2),
+            lines_view: (0, new_normalized.len()),
+            lines_normalized: new_normalized,
+            text_processed,
+        }
+    }
+
+    /// Get a slice of the normalized lines in this `TextData`.
+    pub fn lines(&self) -> &[String] {
+        &self.lines_normalized[self.lines_view.0..self.lines_view.1]
+    }
+
+    /// Compare this `TextData` with another, returning a similarity score.
+    ///
+    /// This is what's used during analysis to rank licenses.
+    pub fn match_score(&self, other: &Self) -> f32 {
+        self.match_data.dice(&other.match_data)
+    }
+
+    /// Determines if this [`TextData`] is equal to another
+    #[inline]
+    pub fn ngram_matches(&self, other: &Self) -> bool {
+        self.match_data.eq(&other.match_data)
+    }
+
+    /// Attempt to optimize a known match to locate possible line ranges.
+    ///
+    /// Returns a new `TextData` struct and a score. The returned struct is a
+    /// clone of `self`, with its view set to the best match against `other`.
+    ///
+    /// This will respect any views set on the `TextData` (an optimized result
+    /// won't go outside the original view).
+    ///
+    /// Note that this won't be 100% optimal if there are blank lines
+    /// surrounding the actual match, since successive blank lines in a range
+    /// will likely have the same score.
+    ///
+    /// You should check the value of `lines_view` on the returned struct to
+    /// find the line ranges.
+    pub fn optimize_bounds(&self, other: &Self) -> (Self, f32) {
+        let view = self.lines_view;
+
+        // optimize the ending bounds of the text match
+        let (end_optimized, _) = self.search_optimize(
+            &|end| self.with_view(view.0, end).match_score(other),
+            &|end| self.with_view(view.0, end),
+        );
+        let new_end = end_optimized.lines_view.1;
+
+        // then optimize the starting bounds
+        let (optimized, score) = end_optimized.search_optimize(
+            &|start| end_optimized.with_view(start, new_end).match_score(other),
+            &|start| end_optimized.with_view(start, new_end),
+        );
+        (optimized, score)
+    }
+
+    fn search_optimize(
+        &self,
+        score: &dyn Fn(usize) -> f32,
+        value: &dyn Fn(usize) -> Self,
+    ) -> (Self, f32) {
+        // cache score checks, since they're kinda expensive
+        let mut memo: HashMap<usize, f32> = HashMap::new();
+        let mut check_score =
+            |index: usize| -> f32 { *memo.entry(index).or_insert_with(|| score(index)) };
+
+        fn search(score: &mut dyn FnMut(usize) -> f32, left: usize, right: usize) -> (usize, f32) {
+            if right - left <= 3 {
+                // find the index of the highest score in the remaining items
+                return (left..=right)
+                    .map(|x| (x, score(x)))
+                    .fold((0usize, 0f32), |acc, x| if x.1 >= acc.1 { x } else { acc });
+            }
+
+            let low = (left * 2 + right) / 3;
+            let high = (left + right * 2) / 3;
+            let score_low = score(low);
+            let score_high = score(high);
+
+            if score_low > score_high {
+                search(score, left, high - 1)
+            } else {
+                search(score, low + 1, right)
+            }
+        }
+
+        let optimal = search(&mut check_score, self.lines_view.0, self.lines_view.1);
+        (value(optimal.0), optimal.1)
+    }
+}
+
+impl<'a> From<&'a str> for TextData {
+    fn from(text: &'a str) -> Self {
+        Self::new(text)
+    }
+}
+
+impl From<String> for TextData {
+    fn from(text: String) -> Self {
+        Self::new(&text)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn optimize_bounds() {
+        let license_text = "this is a license text\nor it pretends to be one\nit's just a test";
+        let sample_text = "this is a license text\nor it pretends to be one\nit's just a test\nwords\n\nhere is some\ncode\nhello();\n\n//a comment too";
+        let license = TextData::from(license_text).without_text();
+        let sample = TextData::from(sample_text);
+
+        let (optimized, _) = sample.optimize_bounds(&license);
+        println!("{:?}", optimized.lines_view);
+        println!("{:?}", optimized.lines_normalized);
+        assert_eq!((0, 3), optimized.lines_view);
+
+        // add more to the string, try again (avoid int trunc screwups)
+        let sample_text = format!("{}\none more line", sample_text);
+        let sample = TextData::from(sample_text.as_str());
+        let (optimized, _) = sample.optimize_bounds(&license);
+        println!("{:?}", optimized.lines_view);
+        println!("{:?}", optimized.lines_normalized);
+        assert_eq!((0, 3), optimized.lines_view);
+
+        // add to the beginning too
+        let sample_text = format!("some content\nat\n\nthe beginning\n{}", sample_text);
+        let sample = TextData::from(sample_text.as_str());
+        let (optimized, _) = sample.optimize_bounds(&license);
+        println!("{:?}", optimized.lines_view);
+        println!("{:?}", optimized.lines_normalized);
+        // end bounds at 7 and 8 have the same score, since they're empty lines (not
+        // counted). askalono is not smart enough to trim this as close as it
+        // can.
+        assert!(
+            (4, 7) == optimized.lines_view || (4, 8) == optimized.lines_view,
+            "bounds are (4, 7) or (4, 8)"
+        );
+    }
+
+    // if a view is set on the text data, optimize_bounds must not find text
+    // outside of that range
+    #[test]
+    fn optimize_doesnt_grow_view() {
+        let sample_text = "0\n1\n2\naaa aaa\naaa\naaa\naaa\n7\n8";
+        let license_text = "aaa aaa aaa aaa aaa";
+        let sample = TextData::from(sample_text);
+        let license = TextData::from(license_text).without_text();
+
+        // sanity: the optimized bounds should be at (3, 7)
+        let (optimized, _) = sample.optimize_bounds(&license);
+        assert_eq!((3, 7), optimized.lines_view);
+
+        // this should still work
+        let sample = sample.with_view(3, 7);
+        let (optimized, _) = sample.optimize_bounds(&license);
+        assert_eq!((3, 7), optimized.lines_view);
+
+        // but if we shrink the view further, it shouldn't be outside that range
+        let sample = sample.with_view(4, 6);
+        let (optimized, _) = sample.optimize_bounds(&license);
+        assert_eq!((4, 6), optimized.lines_view);
+
+        // restoring the view should still be OK too
+        let sample = sample.with_view(0, 9);
+        let (optimized, _) = sample.optimize_bounds(&license);
+        assert_eq!((3, 7), optimized.lines_view);
+    }
+
+    // ensure we don't choke on small TextData matches
+    #[test]
+    fn match_small() {
+        let a = TextData::from("a b");
+        let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");
+
+        let x = a.match_score(&b);
+        let y = b.match_score(&a);
+
+        assert_eq!(x, y);
+    }
+
+    // don't choke on empty TextData either
+    #[test]
+    fn match_empty() {
+        let a = TextData::from("");
+        let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");
+
+        let x = a.match_score(&b);
+        let y = b.match_score(&a);
+
+        assert_eq!(x, y);
+    }
+
+    #[test]
+    fn view_and_white_out() {
+        let a = TextData::from("aaa\nbbb\nccc\nddd");
+        assert_eq!("aaa bbb ccc ddd", a.text_processed);
+
+        let b = a.with_view(1, 3);
+        assert_eq!(2, b.lines().len());
+        assert_eq!("bbb ccc", b.text_processed);
+
+        let c = b.white_out();
+        assert_eq!("aaa ddd", c.text_processed);
+    }
+}
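The `optimize_bounds`/`white_out` pair above is what enables finding several licenses in one file: locate the best match, blank out its lines, and search again. A sketch of that loop (the 0.9 threshold and the pass cap are arbitrary choices here, mirroring the defaults used by the scanner later in this diff):

```rust
use spdx::detection::{Store, TextData};

/// Returns (license name, line range) pairs for every strong match in `text`.
fn find_all(store: &Store, text: &str) -> Vec<(String, (usize, usize))> {
    let mut found = Vec::new();
    let mut current = TextData::new(text);

    // Cap the number of passes, like Scanner::max_passes, to avoid
    // spinning on pathological inputs.
    for _ in 0..10 {
        let m = store.analyze(&current);
        if m.score < 0.9 {
            break;
        }

        // Narrow the view to the lines that actually matched...
        let (located, _score) = current.optimize_bounds(m.data);
        found.push((m.name.to_owned(), located.lines_view()));

        // ...then erase them and look for the next license.
        current = located.white_out();
    }

    found
}
```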
diff --git a/src/detection/ngram.rs b/src/detection/ngram.rs
new file mode 100644
index 0000000..c2c1929
--- /dev/null
+++ b/src/detection/ngram.rs
@@ -0,0 +1,173 @@
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::{
+    cmp::min,
+    collections::{HashMap, VecDeque, hash_map::Iter},
+};
+
+#[derive(Clone, Debug)]
+pub struct NgramSet {
+    pub(crate) map: HashMap<String, u32>,
+    pub(crate) n: u8,
+    pub(crate) size: usize,
+}
+
+impl NgramSet {
+    #[inline]
+    pub fn new(n: u8) -> Self {
+        Self {
+            map: HashMap::new(),
+            n,
+            size: 0,
+        }
+    }
+
+    #[inline]
+    pub fn from_str(s: &str, n: u8) -> Self {
+        let mut set = Self::new(n);
+        set.analyze(s);
+        set
+    }
+
+    pub fn analyze(&mut self, s: &str) {
+        let words = s.split(' ');
+
+        let mut deque: VecDeque<&str> = VecDeque::with_capacity(self.n as usize);
+        for w in words {
+            deque.push_back(w);
+            if deque.len() == self.n as usize {
+                let gram = {
+                    let mut g = String::with_capacity(
+                        deque.iter().map(|s| s.len()).sum::<usize>() + self.n as usize - 1,
+                    );
+
+                    for (i, s) in deque.iter().enumerate() {
+                        if i > 0 {
+                            g.push(' ');
+                        }
+
+                        g.push_str(s);
+                    }
+
+                    g
+                };
+
+                self.add_gram(gram);
+                deque.pop_front();
+            }
+        }
+    }
+
+    #[inline]
+    fn add_gram(&mut self, gram: String) {
+        let n = self.map.entry(gram).or_insert(0);
+        *n += 1;
+        self.size += 1;
+    }
+
+    #[inline]
+    pub fn get(&self, gram: &str) -> u32 {
+        if let Some(count) = self.map.get(gram) {
+            *count
+        } else {
+            0
+        }
+    }
+
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.size
+    }
+
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.size == 0
+    }
+
+    pub fn dice(&self, other: &Self) -> f32 {
+        // no sense comparing sets of different sizes
+        if other.n != self.n {
+            return 0f32;
+        }
+
+        // there's obviously no match if either are empty strings;
+        // if we don't check here we could end up with NaN below
+        // when both are empty
+        if self.is_empty() || other.is_empty() {
+            return 0f32;
+        }
+
+        // choose the smaller map to iterate
+        let (x, y) = if self.len() < other.len() {
+            (self, other)
+        } else {
+            (other, self)
+        };
+
+        let mut matches = 0;
+        for (gram, count) in x {
+            matches += min(*count, y.get(gram));
+        }
+
+        (2.0 * matches as f32) / ((self.len() + other.len()) as f32)
+    }
+}
+
+impl PartialEq for NgramSet {
+    fn eq(&self, other: &Self) -> bool {
+        self.n == other.n && self.size == other.size && self.map == other.map
+    }
+}
+
+impl<'a> IntoIterator for &'a NgramSet {
+    type Item = (&'a String, &'a u32);
+    type IntoIter = Iter<'a, String, u32>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.map.iter()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // this is a pretty banal test, but it's a starting point :P
+    #[test]
+    fn can_construct() {
+        let set = NgramSet::new(2);
+        assert_eq!(set.size, 0);
+        assert_eq!(set.n, 2);
+    }
+
+    #[test]
+    fn no_nan() {
+        let a = NgramSet::from_str("", 2);
+        let b = NgramSet::from_str("", 2);
+
+        let score = a.dice(&b);
+
+        assert!(!score.is_nan());
+    }
+
+    #[test]
+    fn same_size() {
+        let a = NgramSet::from_str("", 2);
+        let b = NgramSet::from_str("", 3);
+
+        let score = a.dice(&b);
+
+        assert_eq!(0f32, score);
+    }
+
+    #[test]
+    fn identical() {
+        let a = NgramSet::from_str("one two three apple banana", 2);
+        let b = NgramSet::from_str("one two three apple banana", 2);
+
+        let score = a.dice(&b);
+
+        assert_eq!(1f32, score);
+    }
+}
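`dice` computes the Sørensen-Dice coefficient over bigram multisets: `2 * matches / (len(a) + len(b))`. The module is crate-private, so this is only an illustration of the arithmetic, in the style of the unit tests above:

```rust
#[test]
fn dice_partial_overlap() {
    // 4 words -> 3 bigrams; 7 words -> 6 bigrams; all 3 of a's bigrams
    // also occur in b, so dice = 2 * 3 / (3 + 6) = 0.666...
    let a = NgramSet::from_str("permission is hereby granted", 2);
    let b = NgramSet::from_str("permission is hereby granted free of charge", 2);

    assert!((a.dice(&b) - 2.0 * 3.0 / 9.0).abs() < 1e-6);
}
```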
diff --git a/src/detection/preproc.rs b/src/detection/preproc.rs
new file mode 100644
index 0000000..1f5d27d
--- /dev/null
+++ b/src/detection/preproc.rs
@@ -0,0 +1,426 @@
+// Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::{borrow::Cow, collections::HashMap, sync::LazyLock};
+
+use regex::{Regex, Replacer};
+use unicode_normalization::UnicodeNormalization;
+
+type PreprocFn = dyn Fn(Cow<'_, str>) -> Cow<'_, str>;
+
+trait CowRegex {
+    fn replace_all_cow<'a, R: Replacer>(&self, text: Cow<'a, str>, replace: R) -> Cow<'a, str>;
+}
+
+impl CowRegex for Regex {
+    fn replace_all_cow<'a, R: Replacer>(&self, text: Cow<'a, str>, replace: R) -> Cow<'a, str> {
+        match text {
+            Cow::Borrowed(find) => self.replace_all(find, replace),
+            Cow::Owned(find) => Cow::Owned(self.replace_all(&find, replace).into_owned()),
+        }
+    }
+}
+
+/// A list of preprocessors that normalize text without removing anything
+/// substantial. These operate on one line at a time.
+pub const PREPROC_NORMALIZE: [&PreprocFn; 6] = [
+    &normalize_unicode,
+    &remove_junk,
+    &blackbox_urls,
+    &normalize_horizontal_whitespace,
+    &normalize_punctuation,
+    &trim,
+];
+
+/// A list of preprocessors that more aggressively normalize/mangle text
+/// to make for friendlier matching. May remove statements and lines, and
+/// more heavily normalize punctuation.
+pub const PREPROC_AGGRESSIVE: [&PreprocFn; 8] = [
+    &remove_common_tokens,
+    &normalize_vertical_whitespace,
+    &remove_punctuation,
+    &lowercaseify,
+    &remove_title_line,
+    &remove_copyright_statements,
+    &collapse_whitespace,
+    &trim,
+];
+
+pub fn apply_normalizers(text: &str) -> Vec<String> {
+    let mut lines = Vec::new();
+    for line in text.split('\n') {
+        let mut out = Cow::from(line);
+        for preproc in &PREPROC_NORMALIZE {
+            out = preproc(out);
+        }
+        lines.push(out.into());
+    }
+    lines
+}
+
+pub fn apply_aggressive(text: &str) -> String {
+    let mut out = text.into();
+    for preproc in &PREPROC_AGGRESSIVE {
+        out = preproc(out);
+    }
+    out.into()
+}
+
+// Line-by-line normalizers
+
+fn normalize_unicode(input: Cow<'_, str>) -> Cow<'_, str> {
+    input.nfc().collect::<String>().into()
+}
+
+fn remove_junk(input: Cow<'_, str>) -> Cow<'_, str> {
+    static RX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^\w\s\pP]+").unwrap());
+
+    RX.replace_all_cow(input, "")
+}
+
+fn blackbox_urls(input: Cow<'_, str>) -> Cow<'_, str> {
+    static RX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
+
+    RX.replace_all_cow(input, "http://blackboxed/url")
+}
+
+fn normalize_horizontal_whitespace(input: Cow<'_, str>) -> Cow<'_, str> {
+    static RX: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+").unwrap());
+
+    RX.replace_all_cow(input, " ")
+}
+
+fn normalize_punctuation(input: Cow<'_, str>) -> Cow<'_, str> {
+    struct Rx {
+        quotes: Regex,
+        dash: Regex,
+        open: Regex,
+        close: Regex,
+        under: Regex,
+        copy: Regex,
+    }
+    static RX: LazyLock<Rx> = LazyLock::new(|| Rx {
+        quotes: Regex::new(r#"["'\p{Pi}\p{Pf}]+"#).unwrap(),
+        dash: Regex::new(r"\p{Pd}+").unwrap(),
+        open: Regex::new(r"\p{Ps}+").unwrap(),
+        close: Regex::new(r"\p{Pe}+").unwrap(),
+        under: Regex::new(r"\p{Pc}+").unwrap(),
+        copy: Regex::new(r"[©Ⓒⓒ]").unwrap(),
+    });
+
+    let mut out = input;
+    let rx = &RX;
+    out = rx.quotes.replace_all_cow(out, "'");
+    out = rx.dash.replace_all_cow(out, "-");
+    out = rx.open.replace_all_cow(out, "(");
+    out = rx.close.replace_all_cow(out, ")");
+    out = rx.under.replace_all_cow(out, "_");
+    rx.copy.replace_all_cow(out, "(c)")
+}
+
+fn trim(input: Cow<'_, str>) -> Cow<'_, str> {
+    match input {
+        Cow::Borrowed(text) => text.trim().into(),
+        Cow::Owned(text) => Cow::Owned(text.trim().to_owned()),
+    }
+}
+
+// Aggressive preprocessors
+
+// Cut prefix of string near given byte index.
+// If the given index doesn't lie at a char boundary,
+// returns the biggest prefix with length not exceeding idx.
+// If the index is bigger than the length of the string, returns the whole string.
+fn trim_byte_adjusted(s: &str, idx: usize) -> &str {
+    if idx >= s.len() {
+        return s;
+    }
+
+    if let Some(sub) = s.get(..idx) {
+        sub
+    } else {
+        // Inspect bytes before index
+        let trailing_continuation = s.as_bytes()[..idx]
+            .iter()
+            .rev()
+            // Multibyte characters are encoded in UTF-8 in the following manner:
+            //   first byte | rest of bytes
+            //   1..10xxxxx   10xxxxxx
+            //   ^^^^ number of ones is equal to number of bytes in codepoint
+            // Number of 10xxxxxx bytes in a codepoint is at most 3 in a valid
+            // UTF-8-encoded string, so this loop runs only a few iterations
+            .take_while(|&byte| byte & 0b1100_0000 == 0b1000_0000)
+            .count();
+        // Subtract 1 to take the first byte in codepoint into account
+        &s[..idx - trailing_continuation - 1]
+    }
+}
+
+fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str {
+    // find the length of common prefix in byte representations of strings
+    let prefix_len = f_line
+        .as_bytes()
+        .iter()
+        .zip(s_line.as_bytes())
+        .take_while(|&(&f, &s)| f == s)
+        .count();
+
+    trim_byte_adjusted(f_line, prefix_len).trim()
+}
+
+fn remove_common_tokens(input: Cow<'_, str>) -> Cow<'_, str> {
+    let mut l_iter = input.split('\n');
+
+    let mut prefix_counts = HashMap::<_, u32>::new();
+
+    // pass 1: iterate through the text to record common prefixes
+    if let Some(first) = l_iter.next() {
+        let mut pair = ("", first);
+        let line_pairs = std::iter::from_fn(|| {
+            pair = (pair.1, l_iter.next()?);
+            Some(pair)
+        });
+        for (a, b) in line_pairs {
+            let common = lcs_substr(a, b);
+
+            // why start at 1, then immediately add 1?
+            // lcs_substr compares two lines!
+            // this doesn't need to be exact, just consistent.
+            if common.len() > 3 {
+                *prefix_counts.entry(common).or_insert(1) += 1;
+            }
+        }
+    }
+
+    // look at the most common observed prefix
+    let most_common = match prefix_counts.iter().max_by_key(|&(_k, v)| v) {
+        Some((prefix, _count)) => prefix,
+        None => return input,
+    };
+
+    // reconcile the count with other longer prefixes that may be stored
+    let common_count = prefix_counts
+        .iter()
+        .filter_map(|(s, count)| Some(count).filter(|_| s.starts_with(most_common)))
+        .sum::<u32>();
+
+    let line_count = input.split('\n').count();
+
+    // the common string must be at least 80% of the text
+    let prefix_threshold = (0.8f32 * line_count as f32) as _;
+    if common_count < prefix_threshold {
+        return input;
+    }
+
+    // pass 2: remove that substring
+    let mut rem = String::with_capacity(input.len());
+    for line in input.split('\n') {
+        rem.push_str(line.strip_prefix(most_common).unwrap_or(line).trim());
+        rem.push('\n');
+    }
+
+    // pop trailing newline
+    rem.pop();
+    rem.into()
+}
+
+fn normalize_vertical_whitespace(input: Cow<'_, str>) -> Cow<'_, str> {
+    struct Rx {
+        misc: Regex,
+        num: Regex,
+    }
+    static RX: LazyLock<Rx> = LazyLock::new(|| Rx {
+        misc: Regex::new(r"[\r\n\v\f]").unwrap(),
+        num: Regex::new(r"\n{3,}").unwrap(),
+    });
+
+    let mut out = input;
+    let rx = &RX;
+    out = rx.misc.replace_all_cow(out, "\n");
+    rx.num.replace_all_cow(out, "\n\n")
+}
+
+fn remove_punctuation(input: Cow<'_, str>) -> Cow<'_, str> {
+    static RX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^\w\s]+").unwrap());
+
+    RX.replace_all_cow(input, "")
+}
+
+fn lowercaseify(input: Cow<'_, str>) -> Cow<'_, str> {
+    input.to_lowercase().into()
+}
+
+fn remove_title_line(input: Cow<'_, str>) -> Cow<'_, str> {
+    static RX: LazyLock<Regex> =
+        LazyLock::new(|| Regex::new(r"^.*license( version \S+)?( copyright.*)?\n\n").unwrap());
+
+    RX.replace_all_cow(input, "")
+}
+
+fn remove_copyright_statements(input: Cow<'_, str>) -> Cow<'_, str> {
+    static RX: LazyLock<Regex> = LazyLock::new(|| {
+        Regex::new(
+            r"(?mx)
+            (
+                # either a new paragraph, or the beginning of the text + empty lines
+                (\n\n|\A\n*)
+                # any number of lines starting with 'copyright' followed by a new paragraph
+                (^\x20*copyright.*?$)+
+                \n\n
+            )
+            |
+            (
+                # or the very first line if it has 'copyright' in it
+                \A.*copyright.*$
+            )
+            |
+            (
+                # or any lines that really look like a copyright statement
+                ^copyright (\s+(c|\d+))+ .*?$
+            )
+        ",
+        )
+        .unwrap()
+    });
+
+    RX.replace_all_cow(input, "\n\n")
+}
+
+fn collapse_whitespace(input: Cow<'_, str>) -> Cow<'_, str> {
+    static RX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+").unwrap());
+    RX.replace_all_cow(input, " ")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn trim_byte_adjusted_respects_multibyte_characters() {
+        let input = "RustКраб橙蟹🦀";
+        let expected = [
+            "",
+            "R",
+            "Ru",
+            "Rus",
+            "Rust",
+            "Rust",
+            "RustК",
+            "RustК",
+            "RustКр",
+            "RustКр",
+            "RustКра",
+            "RustКра",
+            "RustКраб",
+            "RustКраб",
+            "RustКраб",
+            "RustКраб橙",
+            "RustКраб橙",
+            "RustКраб橙",
+            "RustКраб橙蟹",
+            "RustКраб橙蟹",
+            "RustКраб橙蟹",
+            "RustКраб橙蟹",
+            "RustКраб橙蟹🦀",
+        ];
+
+        for (i, &outcome) in expected.iter().enumerate() {
+            assert_eq!(outcome, trim_byte_adjusted(input, i));
+        }
+    }
+
+    #[test]
+    fn greatest_substring_removal() {
+        // the funky string syntax \n\ is to add a newline but skip the
+        // leading whitespace in the source code
+        let text = "%%Copyright: Copyright\n\
+                    %%Copyright: All rights reserved.\n\
+                    %%Copyright: Redistribution and use in source and binary forms, with or\n\
+                    %%Copyright: without modification, are permitted provided that the\n\
+                    %%Copyright: following conditions are met:\n\
+                    \n\
+                    abcd";
+
+        let new_text = remove_common_tokens(text.into());
+        println!("{}", new_text);
+
+        assert!(
+            !new_text.contains("%%Copyright"),
+            "new text shouldn't contain the common substring"
+        );
+    }
+
+    #[test]
+    fn greatest_substring_removal_keep_inner() {
+        let text = "this string should still have\n\
+                    this word -> this <- in it even though\n\
+                    this is still the most common word";
+        let new_text = remove_common_tokens(text.into());
+        println!("-- {}", new_text);
+        // the "this" at the start of the line can be discarded...
+        assert!(!new_text.contains("\nthis"));
+        // ...but the "this" in the middle of sentences shouldn't be
+        assert!(new_text.contains("this"));
+
+        let text = "aaaa bbbb cccc dddd\n\
+                    eeee ffff aaaa gggg\n\
+                    hhhh iiii jjjj";
+        let new_text = remove_common_tokens(text.into());
+        println!("-- {}", new_text);
+        assert!(new_text.contains("aaaa")); // similar to above test
+    }
+
+    #[test]
+    fn greatest_substring_removal_42() {
+        // https://github.com/jpeddicord/askalono/issues/42
+        let text = "AAAAAA line 1\n\
+                    AAAAAA another line here\n\
+                    AAAAAA yet another line here\n\
+                    AAAAAA how long will this go on\n\
+                    AAAAAA another line here\n\
+                    AAAAAA more\n\
+                    AAAAAA one more\n\
+                    AAAAAA two more\n\
+                    AAAAAA three more\n\
+                    AAAAAA four more\n\
+                    AAAAAA five more\n\
+                    AAAAAA six more\n\
+                    \n\
+                    preserve\n\
+                    keep";
+        let new_text = remove_common_tokens(text.into());
+        println!("{}", new_text);
+
+        assert!(new_text.contains("preserve"));
+        assert!(new_text.contains("keep"));
+        assert!(!new_text.contains("AAAAAA"));
+    }
+
+    #[test]
+    fn normalize_no_line_mangle() {
+        let text = "some license
+
+copyright 2012 person
+
+\tlicense\r
+text
+
+\t
+
+
+
+goes
+here";
+
+        let text_lines = text.lines().count();
+
+        let normalized = apply_normalizers(text);
+        let normalized_lines = normalized.len();
+
+        assert_eq!(
+            text_lines, normalized_lines,
+            "normalizers shouldnt change line counts"
+        );
+    }
+}
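The two pipelines split the work: `apply_normalizers` is line-preserving (so line ranges stay meaningful for `lines_view`), while `apply_aggressive` mangles the text into the form that is n-grammed. These helpers are private to the crate, so the following is only an illustration in the style of the tests above (the exact output depends on the regexes as written):

```rust
#[test]
fn pipeline_sketch() {
    let input = "Copyright © 2024 Someone\n\nMIT  License\n\nPermission is hereby granted...";

    // One output line per input line: © becomes (c), whitespace is
    // canonicalized, but nothing is dropped.
    let lines = apply_normalizers(input);
    assert_eq!(lines.len(), 5);

    // The aggressive pass lowercases, strips punctuation, and removes
    // copyright statements before n-gram hashing.
    let processed = apply_aggressive(&lines.join("\n"));
    assert!(!processed.contains("copyright"));
    assert!(processed.contains("permission is hereby granted"));
}
```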
diff --git a/src/detection/scan.rs b/src/detection/scan.rs
new file mode 100644
index 0000000..2398271
--- /dev/null
+++ b/src/detection/scan.rs
@@ -0,0 +1,521 @@
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::{borrow::Cow, fmt};
+
+use crate::detection::{
+    Store,
+    detect::Match,
+    license::{LicenseType, TextData},
+};
+
+/// A struct describing a license that was identified, as well as its type.
+#[derive(Copy, Clone)]
+pub struct IdentifiedLicense<'a> {
+    /// The identifier of the license.
+    pub name: &'a str,
+    /// The type of the license that was matched.
+    pub kind: LicenseType,
+    /// A reference to the license data inside the store.
+    pub data: &'a TextData,
+}
+
+impl<'a> fmt::Debug for IdentifiedLicense<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("IdentifiedLicense")
+            .field("name", &self.name)
+            .field("kind", &self.kind)
+            .finish()
+    }
+}
+
+/// Information about scanned content.
+///
+/// Produced by `Scanner::scan`.
+#[derive(Debug)]
+pub struct ScanResult<'a> {
+    /// The confidence of the match from 0.0 to 1.0.
+    pub score: f32,
+    /// The identified license of the overall text, or None if nothing met the
+    /// confidence threshold.
+    pub license: Option<IdentifiedLicense<'a>>,
+    /// Any licenses discovered inside the text, if `optimize` was enabled.
+    pub containing: Vec<ContainedResult<'a>>,
+}
+
+/// A struct describing a single license identified within a larger text.
+#[derive(Debug, Copy, Clone)]
+pub struct ContainedResult<'a> {
+    /// The confidence of the match within the line range from 0.0 to 1.0.
+    pub score: f32,
+    /// The license identified in this portion of the text.
+    pub license: IdentifiedLicense<'a>,
+    /// A 0-indexed (inclusive, exclusive) range of line numbers identifying
+    /// where in the overall text a license was identified.
+    ///
+    /// See `TextData.lines_view()` for more information.
+    pub line_range: (usize, usize),
+}
+
+/// A `Scanner` can be used as a high-level wrapper over a `Store`'s
+/// analysis logic.
+///
+/// A strategy configured here can be run repeatedly to scan a document for
+/// multiple licenses, or to automatically optimize to locate texts within a
+/// larger text.
+///
+/// # Examples
+///
+/// ```rust,should_panic
+/// # use std::error::Error;
+/// use spdx::detection::{scan::Scanner, Store};
+///
+/// # fn main() -> Result<(), Box<dyn Error>> {
+/// let store = Store::new();
+/// // [...]
+/// let strategy = Scanner::new(&store)
+///     .confidence_threshold(0.9)
+///     .optimize(true);
+/// let results = strategy.scan(&"my text to scan".into());
+/// # Ok(())
+/// # }
+/// ```
+pub struct Scanner<'a> {
+    store: &'a Store,
+    mode: ScanMode,
+    confidence_threshold: f32,
+    shallow_limit: f32,
+    optimize: bool,
+    max_passes: u16,
+}
+
+/// Available scanning strategy modes.
+pub enum ScanMode {
+    /// A general-purpose strategy that iteratively locates the
+    /// highest-scoring license match in a file, then the next, and so on,
+    /// until no more strong matches are found.
+    Elimination,
+    /// A strategy intended for use with attribution documents, or
+    /// text files containing multiple licenses (and not much else).
+    ///
+    /// It's more accurate than `Elimination`, but significantly slower.
+    TopDown {
+        /// A smaller step size will be more accurate at a significant cost of
+        /// speed.
+        ///
+        /// Defaults to 5.
+        step_size: usize,
+    },
+}
+
+impl ScanMode {
+    /// Creates a `TopDown` strategy with the default step size
+    #[inline]
+    pub fn top_down() -> Self {
+        Self::TopDown { step_size: 5 }
+    }
+}
+
+impl<'a> Scanner<'a> {
+    /// Construct a new scanning strategy tied to the given `Store`.
+    ///
+    /// The strategy starts with conservative defaults and won't perform any
+    /// deeper investigation into the contents of files.
+    #[inline]
+    pub fn new(store: &'a Store) -> Self {
+        Self::with_scan_mode(store, ScanMode::Elimination)
+    }
+
+    /// Constructs a scanning strategy with the specified mode
+    #[inline]
+    pub fn with_scan_mode(store: &'a Store, mode: ScanMode) -> Self {
+        Self {
+            store,
+            mode,
+            confidence_threshold: 0.9,
+            shallow_limit: 0.99,
+            optimize: false,
+            max_passes: 10,
+        }
+    }
+}
+
+impl Scanner<'_> {
+    /// Set the confidence threshold for this strategy.
+    ///
+    /// The overall license match must meet this number in order to be
+    /// reported. Additionally, if contained licenses are reported in the scan
+    /// (when `optimize` is enabled), they'll also need to meet this bar.
+    ///
+    /// Set this to 1.0 for only exact matches, and 0.0 to report even the
+    /// weakest match.
+    pub fn confidence_threshold(mut self, confidence_threshold: f32) -> Self {
+        self.confidence_threshold = confidence_threshold;
+        self
+    }
+
+    /// Set a fast-exit parameter that allows the strategy to skip the rest of
+    /// a scan for strong matches.
+    ///
+    /// This should be set higher than the confidence threshold; ideally close
+    /// to 1.0. 
If the overall match score is above this limit, the scanner + /// will return early and not bother performing deeper checks. + /// + /// This is really only useful in conjunction with `optimize`. A value of + /// 0.0 will fast-return on any match meeting the confidence threshold, + /// while a value of 1.0 will only stop on a perfect match. + pub fn shallow_limit(mut self, shallow_limit: f32) -> Self { + self.shallow_limit = shallow_limit; + self + } + + /// Indicate whether a deeper scan should be performed. + /// + /// This is ignored if the shallow limit is met. It's not enabled by + /// default, however, so if you want deeper results you should set + /// `shallow_limit` fairly high and enable this. + pub fn optimize(mut self, optimize: bool) -> Self { + self.optimize = optimize; + self + } + + /// The maximum number of identifications to perform before exiting a scan + /// of a single text. + /// + /// This is largely to prevent misconfigurations and infinite loop + /// scenarios, but if you have a document with a large number of licenses + /// then you may want to tune this to a value above the number of licenses + /// you expect to be identified. + pub fn max_passes(mut self, max_passes: u16) -> Self { + self.max_passes = max_passes; + self + } + + /// Scan the given text content using this strategy's configured + /// preferences. + /// + /// Returns a `ScanResult` containing all discovered information. + #[inline] + pub fn scan(&'_ self, text: &TextData) -> ScanResult<'_> { + match self.mode { + ScanMode::Elimination => self.scan_elimination(text), + ScanMode::TopDown { step_size } => self.scan_topdown(text, step_size), + } + } + + fn scan_elimination(&'_ self, text: &TextData) -> ScanResult<'_> { + let mut analysis = self.store.analyze(text); + let score = analysis.score; + let mut license = None; + let mut containing = Vec::new(); + + // meets confidence threshold? 
record that
+        if analysis.score > self.confidence_threshold {
+            license = Some(IdentifiedLicense {
+                name: analysis.name,
+                kind: analysis.license_type,
+                data: analysis.data,
+            });
+
+            // above the shallow limit -> exit
+            if analysis.score > self.shallow_limit {
+                return ScanResult {
+                    score,
+                    license,
+                    containing,
+                };
+            }
+        }
+
+        if !self.optimize {
+            return ScanResult {
+                score,
+                license,
+                containing,
+            };
+        }
+
+        // repeatedly try to dig deeper
+        // this loop effectively iterates once for each license it finds
+        let mut current_text: Cow<'_, TextData> = Cow::Borrowed(text);
+        for _n in 0..self.max_passes {
+            let (optimized, optimized_score) = current_text.optimize_bounds(analysis.data);
+
+            // stop if we didn't find anything acceptable
+            if optimized_score < self.confidence_threshold {
+                break;
+            }
+
+            // otherwise, save it
+            containing.push(ContainedResult {
+                score: optimized_score,
+                license: IdentifiedLicense {
+                    name: analysis.name,
+                    kind: analysis.license_type,
+                    data: analysis.data,
+                },
+                line_range: optimized.lines_view(),
+            });
+
+            // and white-out + reanalyze for next iteration
+            current_text = Cow::Owned(optimized.white_out());
+            analysis = self.store.analyze(&current_text);
+        }
+
+        ScanResult {
+            score,
+            license,
+            containing,
+        }
+    }
+
+    fn scan_topdown(&'_ self, text: &TextData, step_size: usize) -> ScanResult<'_> {
+        let (_, text_end) = text.lines_view();
+        let mut containing = Vec::new();
+
+        // find licenses working down through the text's lines
+        let mut current_start = 0usize;
+        while current_start < text_end {
+            let result = self.topdown_find_contained_license(text, current_start, step_size);
+
+            let contained = match result {
+                Some(c) => c,
+                None => break,
+            };
+
+            current_start = contained.line_range.1 + 1;
+            containing.push(contained);
+        }
+
+        ScanResult {
+            score: 0.0,
+            license: None,
+            containing,
+        }
+    }
+
+    fn topdown_find_contained_license(
+        &'_ self,
+        text: &TextData,
+        starting_at: usize,
+        step_size: usize,
+    ) -> Option<ContainedResult<'_>> {
+        let (_, text_end) = text.lines_view();
+        let mut found: (usize, usize, Option<Match<'_>>) = (0, 0, None);
+
+        // speed: only start tracking once the confidence threshold is met,
+        // and bail out once the score drops back below it
+        let mut hit_threshold = false;
+
+        // move the start of window...
+        'start: for start in (starting_at..text_end).step_by(step_size) {
+            // ...and also the end of window to find high scores.
+            for end in (start..=text_end).step_by(step_size) {
+                let view = text.with_view(start, end);
+                let analysis = self.store.analyze(&view);
+
+                // just getting a feel for the data at this point, not yet
+                // optimizing the view.
+
+                // entering threshold: save the starting location
+                if !hit_threshold && analysis.score >= self.confidence_threshold {
+                    hit_threshold = true;
+                }
+
+                if hit_threshold {
+                    if analysis.score < self.confidence_threshold {
+                        // exiting threshold
+                        break 'start;
+                    } else {
+                        // maintaining threshold (also true for entering)
+                        found = (start, end, Some(analysis));
+                    }
+                }
+            }
+        }
+
+        // at this point we have *rough* bounds for a match. 
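+        // (`found` holds the widest (start, end) window whose score stayed at
+        // or above the confidence threshold, plus the `Match` captured for it)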
+ // now we can optimize to find the best one + let matched = found.2?; + let check = matched.data; + let view = text.with_view(found.0, found.1); + let (optimized, optimized_score) = view.optimize_bounds(check); + + if optimized_score < self.confidence_threshold { + return None; + } + + Some(ContainedResult { + score: optimized_score, + license: IdentifiedLicense { + name: matched.name, + kind: matched.license_type, + data: matched.data, + }, + line_range: optimized.lines_view(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn can_construct() { + let store = Store::new(); + Scanner::new(&store); + Scanner::new(&store).confidence_threshold(0.5); + Scanner::new(&store) + .shallow_limit(0.99) + .optimize(true) + .max_passes(100); + } + + #[test] + fn shallow_scan() { + let store = create_dummy_store(); + let test_data = TextData::new("lorem ipsum\naaaaa bbbbb\nccccc\nhello"); + + // the above text should have a result with a confidence minimum of 0.5 + let strategy = Scanner::new(&store) + .confidence_threshold(0.5) + .shallow_limit(0.0); + let result = strategy.scan(&test_data); + assert!( + result.score > 0.5, + "score must meet threshold; was {}", + result.score + ); + assert_eq!( + result.license.expect("result has a license").name, + "license-1" + ); + + // but it won't pass with a threshold of 0.8 + let strategy = Scanner::new(&store) + .confidence_threshold(0.8) + .shallow_limit(0.0); + let result = strategy.scan(&test_data); + assert!(result.license.is_none(), "result license is None"); + } + + #[test] + fn single_optimize() { + let store = create_dummy_store(); + // this TextData matches license-2 with an overall score of ~0.46 and optimized + // score of ~0.57 + let test_data = TextData::new( + "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout", + ); + + // check that we can spot the gibberish license in the sea of other gibberish + let strategy = Scanner::new(&store) + .confidence_threshold(0.5) + .optimize(true) + .shallow_limit(1.0); + let result = strategy.scan(&test_data); + assert!(result.license.is_none(), "result license is None"); + assert_eq!(result.containing.len(), 1); + let contained = &result.containing[0]; + assert_eq!(contained.license.name, "license-2"); + assert!( + contained.score > 0.5, + "contained score is greater than threshold" + ); + } + + #[test] + fn find_multiple_licenses_elimination() { + let store = create_dummy_store(); + // this TextData matches license-2 with an overall score of ~0.46 and optimized + // score of ~0.57 + let test_data = TextData::new( + "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc", + ); + + // check that we can spot the gibberish license in the sea of other gibberish + let strategy = Scanner::new(&store) + .confidence_threshold(0.5) + .optimize(true) + .shallow_limit(1.0); + let result = strategy.scan(&test_data); + assert!(result.license.is_none(), "result license is None"); + assert_eq!(2, result.containing.len()); + + // inspect the array and ensure we got both licenses + let mut found1 = 0; + let mut found2 = 0; + for contained in &result.containing { + match contained.license.name { + "license-1" => { + assert!(contained.score > 0.5, "license-1 score meets threshold"); + found1 += 1; + } + "license-2" => { + assert!(contained.score > 0.5, "license-2 score meets threshold"); + found2 += 1; + } 
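+                // any other license name means the dummy store was built incorrectly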
+                _ => {
+                    panic!("somehow got an unknown license name");
+                }
+            }
+        }
+
+        assert!(
+            found1 == 1 && found2 == 1,
+            "found both licenses exactly once"
+        );
+    }
+
+    #[test]
+    fn find_multiple_licenses_topdown() {
+        let store = create_dummy_store();
+        // this TextData matches license-2 with an overall score of ~0.46 and optimized
+        // score of ~0.57
+        let test_data = TextData::new(
+            "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc",
+        );
+
+        // check that we can spot the gibberish license in the sea of other gibberish
+        let strategy = Scanner::with_scan_mode(&store, ScanMode::TopDown { step_size: 1 })
+            .confidence_threshold(0.5);
+        let result = strategy.scan(&test_data);
+        assert!(result.license.is_none(), "result license is None");
+        println!("{:?}", result);
+        assert_eq!(2, result.containing.len());
+
+        // inspect the array and ensure we got both licenses
+        let mut found1 = 0;
+        let mut found2 = 0;
+        for contained in &result.containing {
+            match contained.license.name {
+                "license-1" => {
+                    assert!(contained.score > 0.5, "license-1 score meets threshold");
+                    found1 += 1;
+                }
+                "license-2" => {
+                    assert!(contained.score > 0.5, "license-2 score meets threshold");
+                    found2 += 1;
+                }
+                // any other license name means the dummy store was built incorrectly
+                _ => {
+                    panic!("somehow got an unknown license name");
+                }
+            }
+        }
+
+        assert!(
+            found1 == 1 && found2 == 1,
+            "found both licenses exactly once"
+        );
+    }
+
+    fn create_dummy_store() -> Store {
+        let mut store = Store::new();
+        store.add_license("license-1".into(), "aaaaa\nbbbbb\nccccc".into());
+        store.add_license(
+            "license-2".into(),
+            "1234 5678 1234\n0000\n1010101010\n\n8888 9999".into(),
+        );
+        store
+    }
+}
diff --git a/src/expression.rs b/src/expression.rs
index a326f20..8dfd267 100644
--- a/src/expression.rs
+++ b/src/expression.rs
@@ -8,11 +8,14 @@ pub use minimize::MinimizeError;
 use smallvec::SmallVec;
 use std::fmt;
 
-/// A license requirement inside an SPDX license expression, including
-/// the span in the expression where it is located
+/// A license requirement inside an SPDX license expression
+///
+/// Includes the span in the expression where it is located
 #[derive(Debug, Clone)]
 pub struct ExpressionReq {
+    /// The license requirement
     pub req: LicenseReq,
+    /// The span in the original license expression string containing the requirement
     pub span: std::ops::Range<u32>,
 }
 
@@ -25,13 +28,18 @@ impl PartialEq for ExpressionReq {
 /// The joining operators supported by SPDX 2.1
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone)]
 pub enum Operator {
+    /// Conjunctive `AND|and` operator that combines two valid license expressions
     And,
+    /// Disjunctive `OR|or` operator that combines two valid license expressions
     Or,
 }
 
+/// An expression node
 #[derive(Debug, Clone, PartialEq)]
 pub enum ExprNode {
+    /// An operator
     Op(Operator),
+    /// A requirement
     Req(ExpressionReq),
 }
diff --git a/src/lexer.rs b/src/lexer.rs
index 70bb4d4..f1a2b3a 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -70,14 +70,18 @@ pub enum Token<'a> {
     Spdx(LicenseId),
     /// A `LicenseRef-` prefixed id, with an optional `DocumentRef-`
     LicenseRef {
+        /// An optional document reference
         doc_ref: Option<&'a str>,
+        /// The name of the license reference
         lic_ref: &'a str,
     },
     /// A recognized SPDX exception id
     Exception(ExceptionId),
     /// A `AdditionRef-` prefixed id, with an optional `DocumentRef-`
     AdditionRef {
+        /// An optional document reference
        doc_ref: Option<&'a str>,
+        /// The name of the addition reference
        add_ref: 
&'a str,
     },
     /// A postfix `+` indicating "or later" for a particular SPDX license id
diff --git a/src/lib.rs b/src/lib.rs
index 7403e6a..f0e56b4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,13 +1,24 @@
+#![cfg_attr(docsrs, feature(doc_cfg))]
+#![deny(missing_docs)]
+#![doc = include_str!("../README.md")]
+
 /// Error types
 pub mod error;
 pub mod expression;
 /// Auto-generated lists of license identifiers and exception identifiers
+#[allow(missing_docs)]
 pub mod identifiers;
 /// Contains types for lexing an SPDX license expression
 pub mod lexer;
 mod licensee;
+
+/// Allows analysis of text to determine if it resembles a license
+#[cfg(feature = "detection")]
+pub mod detection;
+
 /// Auto-generated full canonical text of each license
 #[cfg(feature = "text")]
+#[allow(missing_docs)]
 pub mod text;
 
 pub use error::ParseError;
@@ -19,7 +30,9 @@ use std::{
     fmt,
 };
 
+/// Flags that can apply to licenses and/or license exceptions
 pub mod flags {
+    /// Inner type of the flags
     pub type Type = u8;
 
     /// Whether the license is listed as free by the [Free Software Foundation](https://www.gnu.org/licenses/license-list.en.html)
@@ -319,13 +332,17 @@ impl fmt::Display for LicenseReq {
     }
 }
 
+/// SPDX allows the use of `LicenseRef-` to provide
+/// arbitrary licenses that aren't a part of the official SPDX license list
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
 pub struct LicenseRef {
-    /// Purpose: Identify any external SPDX documents referenced within this SPDX document.
+    /// Identify any external SPDX documents referenced within this SPDX document.
+    ///
     /// See the [spec](https://spdx.org/spdx-specification-21-web-version#h.h430e9ypa0j9) for
     /// more details.
     pub doc_ref: Option<String>,
-    /// Purpose: Provide a locally unique identifier to refer to licenses that are not found on the SPDX License List.
+    /// Provide a locally unique identifier to refer to licenses that are not found on the SPDX License List.
+    ///
     /// See the [spec](https://spdx.org/spdx-specification-21-web-version#h.4f1mdlm) for
     /// more details.
     pub lic_ref: String,
@@ -348,11 +365,14 @@ impl fmt::Display for LicenseRef {
 pub enum LicenseItem {
     /// A regular SPDX license id
     Spdx {
+        /// The license identifier
         id: LicenseId,
         /// Indicates the license had a `+`, allowing the licensee to license
         /// the software under either the specific version, or any later versions
         or_later: bool,
     },
+    /// SPDX allows the use of `LicenseRef-` to provide
+    /// arbitrary licenses that aren't a part of the official SPDX license list
     Other(Box<LicenseRef>),
 }
 
@@ -432,6 +452,8 @@ impl fmt::Display for LicenseItem {
     }
 }
 
+/// A user supplied `AdditionRef-` to specify additional text to
+/// associate with a license that falls outside the SPDX license list
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
 pub struct AdditionRef {
     /// Purpose: Identify any external SPDX documents referenced within this SPDX document. 
@@ -461,6 +483,8 @@
 pub enum AdditionItem {
     /// A regular SPDX license exception id
     Spdx(ExceptionId),
+    /// A user supplied `AdditionRef-` to specify additional text to
+    /// associate with a license that falls outside the SPDX license list
     Other(Box<AdditionRef>),
 }
 
diff --git a/src/licensee.rs b/src/licensee.rs
index d7ff5f3..6377d55 100644
--- a/src/licensee.rs
+++ b/src/licensee.rs
@@ -234,6 +234,7 @@ impl Licensee {
         req.addition == self.inner.addition
     }
 
+    /// Converts this [`Self`] into a [`LicenseReq`]
     #[must_use]
     pub fn into_req(self) -> LicenseReq {
         self.inner
diff --git a/tests/detection.rs b/tests/detection.rs
new file mode 100644
index 0000000..275ce72
--- /dev/null
+++ b/tests/detection.rs
@@ -0,0 +1,30 @@
+#![cfg(feature = "detection")]
+
+#[cfg(feature = "detection-inline-cache")]
+#[test]
+fn reads_inline_cache() {
+    let store = spdx::detection::Store::load_inline().expect("failed to load cache");
+
+    let mut set = std::collections::BTreeSet::new();
+
+    for (k, v) in store.iter() {
+        set.insert(k.as_str());
+
+        for alias in &v.aliases {
+            set.insert(alias.as_str());
+        }
+    }
+
+    // NOASSERTION is a "fake" license id that we add manually since it's not
+    // part of SPDX (though it might be in the future,
+    // https://github.com/spdx/spdx-spec/issues/50), so it should be the only
+    // license that isn't present in the store
+    for lic in spdx::identifiers::LICENSES {
+        if lic.name != "NOASSERTION" {
+            assert!(
+                set.contains(lic.name),
+                "failed to find expected license {} in inline cache store",
+                lic.name
+            );
+        }
+    }
+}
diff --git a/update/Cargo.toml b/update/Cargo.toml
index f5a4c46..ecf9b69 100644
--- a/update/Cargo.toml
+++ b/update/Cargo.toml
@@ -8,3 +8,4 @@ publish = false
 [dependencies]
 anyhow = "1.0"
 serde_json = "1.0.48"
+spdx = { path = "..", features = ["detection-cache"] }
diff --git a/update/src/main.rs b/update/src/main.rs
index 0ecb246..fe3271a 100644
--- a/update/src/main.rs
+++ b/update/src/main.rs
@@ -314,6 +314,88 @@ use crate::{{Exception, License, flags::*}};
     write_license_texts(texts, v.into_iter().map(|(name, _, _)| name))
 }
 
+fn write_cache() -> anyhow::Result<()> {
+    let json: Map<String, Value> = serde_json::from_str(
+        &std::fs::read_to_string(format!("{ROOT}/json/licenses.json"))
+            .context("unable to open licenses.json")?,
+    )
+    .context("failed to deserialize licenses.json")?;
+
+    let licenses = get(&json, "licenses")?;
+    let licenses = if let Value::Array(v) = licenses {
+        v
+    } else {
+        bail!("Malformed JSON: {licenses:?}")
+    };
+
+    use spdx::detection as sd;
+
+    let mut texts = std::collections::BTreeMap::<String, sd::LicenseEntry>::new();
+
+    for lic in licenses.iter() {
+        let lic = if let Value::Object(ref m) = *lic {
+            m
+        } else {
+            bail!("Malformed JSON: {lic:?}")
+        };
+
+        let lic_id = get(lic, "licenseId")?
+            .as_str()
+            .context("licenseId was not a string")?;
+
+        let details: Map<String, Value> = serde_json::from_str(
+            &std::fs::read_to_string(format!("{ROOT}/json/details/{lic_id}.json"))
+                .with_context(|| format!("failed to read license details for {lic_id}"))?,
+        )
+        .with_context(|| format!("failed to deserialize details for {lic_id}"))?;
+
+        let text = get(&details, "licenseText")?
+            .as_str()
+            .context("licenseText was not a string")?;
+
+        let content = sd::TextData::new(text);
+
+        let mut already_existed = false;
+        for (name, v) in &mut texts {
+            if !v.original.ngram_matches(&content) {
+                continue;
+            }
+
+            v.aliases.push(lic_id.to_owned());
+            println!("{lic_id} already stored; added as an alias for {name}");
+
+            if lic_id.starts_with("GFDL-") {
+                if let Some(id) = lic_id.strip_suffix("-invariants-only") 
{ + v.aliases.push(format!("{id}-invariants")); + } + } + + already_existed = true; + } + + if already_existed { + continue; + } + + let license = texts + .entry(lic_id.to_owned()) + .or_insert_with(|| sd::LicenseEntry::new(content)); + + if let Some(header_text) = details.get("standardLicenseHeader").and_then(|h| h.as_str()) { + license.headers.push(sd::TextData::new(header_text)); + } + + if lic_id.starts_with("GFDL-") { + if let Some(id) = lic_id.strip_suffix("-invariants-only") { + license.aliases.push(format!("{id}-invariants")); + } + } + } + + let mut s = sd::Store::new(); + for (key, entry) in texts { + s.insert_entry(key, entry); + } + + let mut f = std::fs::File::create("src/detection/cache.bin.zstd")?; + s.to_cache(&mut f).context("failed to store cache")?; + f.flush().context("failed to flush cache to disk")?; + + Ok(()) +} + fn real_main() -> Result<()> { let mut upstream_tag = None; let mut debug = false; @@ -377,6 +459,8 @@ fn real_main() -> Result<()> { .success() ); + let t = std::thread::spawn(write_cache); + { let mut identifiers = io::BufWriter::new(std::fs::File::create("src/identifiers.rs")?); @@ -448,6 +532,7 @@ fn real_main() -> Result<()> { .write(&readme.as_bytes()[end_index..]) .context("failed to write suffix")?; + t.join().unwrap()?; Ok(()) }
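
A rough usage sketch of the detection API added in this diff, assuming the
`detection` and `detection-inline-cache` features are enabled (all names are
taken from the new code above; the input text is a placeholder):

    use spdx::detection::{scan::Scanner, Store, TextData};

    fn main() {
        // load the pre-built, zstd-compressed cache shipped with the crate
        let store = Store::load_inline().expect("failed to load cache");
        // only report matches at or above 90% confidence
        let scanner = Scanner::new(&store).confidence_threshold(0.9);
        let result = scanner.scan(&TextData::new("...text to scan..."));
        if let Some(license) = result.license {
            println!("detected {} (score {:.2})", license.name, result.score);
        }
    }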