From 98013c804f7c425de100f2a9ab1bddd5cac318df Mon Sep 17 00:00:00 2001 From: Tijs-B <14929648+Tijs-B@users.noreply.github.com> Date: Tue, 14 Oct 2025 23:20:10 +0200 Subject: [PATCH] feat(gitcommit): use tree-sitter for gitcommit The previous gitcommit parser naively took the contents until the first comment. If there are inline comments, such as when squashing two commits in an interactive rebase, the text after those comments would not be checked. Instead, we use tree-sitter-gitcommit parsing and mask the nodes that are subject, message lines or breaking change descriptions. As before, we re-parse these parts with the existing markdown parser. A new crate `harper-git-commit` is added. --- Cargo.lock | 23 ++++++++++- Cargo.toml | 2 +- harper-git-commit/Cargo.toml | 16 ++++++++ harper-git-commit/src/lib.rs | 30 ++++++++++++++ harper-git-commit/tests/run_tests.rs | 41 +++++++++++++++++++ .../test_sources/complex_verbose_commit.txt | 26 ++++++++++++ .../test_sources/conventional_commit.txt | 17 ++++++++ .../tests/test_sources/simple_commit.txt | 1 + harper-ls/Cargo.toml | 1 + harper-ls/src/backend.rs | 6 +-- harper-ls/src/git_commit_parser.rs | 32 --------------- harper-ls/src/main.rs | 2 +- 12 files changed, 158 insertions(+), 39 deletions(-) create mode 100644 harper-git-commit/Cargo.toml create mode 100644 harper-git-commit/src/lib.rs create mode 100644 harper-git-commit/tests/run_tests.rs create mode 100644 harper-git-commit/tests/test_sources/complex_verbose_commit.txt create mode 100644 harper-git-commit/tests/test_sources/conventional_commit.txt create mode 100644 harper-git-commit/tests/test_sources/simple_commit.txt delete mode 100644 harper-ls/src/git_commit_parser.rs diff --git a/Cargo.lock b/Cargo.lock index ba1b0cbc4..8d594f956 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -781,7 +781,7 @@ checksum = "fe6d2e5af09e8c8ad56c969f2157a3d4238cebc7c55f0a517728c38f7b200f81" dependencies = [ "serde", "termcolor", - "unicode-width 0.1.14", + "unicode-width 0.2.2", ] [[package]] @@ -2383,6 +2383,17 @@ dependencies = [ "unicode-width 0.2.2", ] +[[package]] +name = "harper-git-commit" +version = "0.68.0" +dependencies = [ + "harper-core", + "harper-tree-sitter", + "paste", + "tree-sitter", + "tree-sitter-gitcommit", +] + [[package]] name = "harper-html" version = "0.68.0" @@ -2427,6 +2438,7 @@ dependencies = [ "globset", "harper-comments", "harper-core", + "harper-git-commit", "harper-html", "harper-ink", "harper-literate-haskell", @@ -4832,6 +4844,15 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-gitcommit" +version = "0.3.3" +source = "git+https://github.com/gbprod/tree-sitter-gitcommit?rev=a716678c0f00645fed1e6f1d0eb221481dbd6f6d#a716678c0f00645fed1e6f1d0eb221481dbd6f6d" +dependencies = [ + "cc", + "tree-sitter", +] + [[package]] name = "tree-sitter-go" version = "0.25.0" diff --git a/Cargo.toml b/Cargo.toml index 9eab182db..27b1a9109 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst", "harper-stats", "harper-pos-utils", "harper-brill", "harper-ink", "harper-python"] +members = ["harper-cli", "harper-core", "harper-ls", "harper-comments", "harper-wasm", "harper-tree-sitter", "harper-html", "harper-literate-haskell", "harper-typst", "harper-stats", "harper-pos-utils", "harper-brill", "harper-ink", "harper-python", "harper-git-commit"] resolver = "2" # Comment out the below lines if you plan to use a debugger. diff --git a/harper-git-commit/Cargo.toml b/harper-git-commit/Cargo.toml new file mode 100644 index 000000000..6d472985d --- /dev/null +++ b/harper-git-commit/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "harper-git-commit" +version = "0.68.0" +edition = "2024" +description = "The language checker for developers." +license = "Apache-2.0" +repository = "https://github.com/automattic/harper" + +[dependencies] +harper-core = { path = "../harper-core", version = "0.68.0" } +harper-tree-sitter = { path = "../harper-tree-sitter", version = "0.68.0" } +tree-sitter-gitcommit = { git = "https://github.com/gbprod/tree-sitter-gitcommit", rev = "a716678c0f00645fed1e6f1d0eb221481dbd6f6d" } +tree-sitter = "0.25.10" + +[dev-dependencies] +paste = "1.0.15" diff --git a/harper-git-commit/src/lib.rs b/harper-git-commit/src/lib.rs new file mode 100644 index 000000000..67ea7e9a7 --- /dev/null +++ b/harper-git-commit/src/lib.rs @@ -0,0 +1,30 @@ +use harper_core::Token; +use harper_core::parsers::{self, Markdown, MarkdownOptions, Parser}; +use harper_tree_sitter::TreeSitterMasker; +use tree_sitter::Node; + +pub struct GitCommitParser { + /// Used to grab the text nodes, and parse them as markdown. + inner: parsers::Mask, +} + +impl GitCommitParser { + fn node_condition(n: &Node) -> bool { + matches!(n.kind(), "subject" | "message_line" | "breaking_change") + } + + pub fn new(markdown_options: MarkdownOptions) -> Self { + Self { + inner: parsers::Mask::new( + TreeSitterMasker::new(tree_sitter_gitcommit::language(), Self::node_condition), + Markdown::new(markdown_options), + ), + } + } +} + +impl Parser for GitCommitParser { + fn parse(&self, source: &[char]) -> Vec { + self.inner.parse(source) + } +} diff --git a/harper-git-commit/tests/run_tests.rs b/harper-git-commit/tests/run_tests.rs new file mode 100644 index 000000000..aa872f602 --- /dev/null +++ b/harper-git-commit/tests/run_tests.rs @@ -0,0 +1,41 @@ +use harper_core::linting::{LintGroup, Linter}; +use harper_core::parsers::MarkdownOptions; +use harper_core::spell::FstDictionary; +use harper_core::{Dialect, Document}; +use harper_git_commit::GitCommitParser; + +/// Creates a unit test checking that the linting of a git commit document (in +/// `tests_sources`) produces the expected number of lints. +macro_rules! create_test { + ($filename:ident.txt, $correct_expected:expr) => { + paste::paste! { + #[test] + fn [](){ + let source = include_str!( + concat!( + "./test_sources/", + concat!(stringify!($filename), ".txt") + ) + ); + + let dict = FstDictionary::curated(); + let document = Document::new(source, &GitCommitParser::new(MarkdownOptions::default()), &dict); + + let mut linter = LintGroup::new_curated(dict, Dialect::American); + let lints = linter.lint(&document); + + dbg!(&lints); + assert_eq!(lints.len(), $correct_expected); + + // Make sure that all generated tokens span real characters + for token in document.tokens(){ + assert!(token.span.try_get_content(document.get_source()).is_some()); + } + } + } + }; +} + +create_test!(simple_commit.txt, 1); +create_test!(complex_verbose_commit.txt, 2); +create_test!(conventional_commit.txt, 3); diff --git a/harper-git-commit/tests/test_sources/complex_verbose_commit.txt b/harper-git-commit/tests/test_sources/complex_verbose_commit.txt new file mode 100644 index 000000000..cb369672b --- /dev/null +++ b/harper-git-commit/tests/test_sources/complex_verbose_commit.txt @@ -0,0 +1,26 @@ +This is the the subject + +This is a first line without typos +# This is a comment with a typooo that should be ignored +This is a line below the comment with typooos + +# Please enter the commit message for your changes. Lines starting +# with '#' will be ignored, and an empty message aborts the commit. +# +# On branch main +# +# Initial commit +# +# Changes to be committed: +#new file: myfile.txt +# +# ------------------------ >8 ------------------------ +# Do not modify or remove the line above. +# Everything below it will be ignored. +diff --git a/myfile.txt b/myfile.txt +new file mode 100644 +index 0000000..485c415 +--- /dev/null ++++ b/myfile.txt +@@ -0,0 +1 @@ ++some textt in the file that was added (the typo should be ignored) diff --git a/harper-git-commit/tests/test_sources/conventional_commit.txt b/harper-git-commit/tests/test_sources/conventional_commit.txt new file mode 100644 index 000000000..1f16b0b69 --- /dev/null +++ b/harper-git-commit/tests/test_sources/conventional_commit.txt @@ -0,0 +1,17 @@ +feat(stuff): use session-based authentiation + +BREAKING CHANGE: JWT authentication removed. API clients mustt now use +session cookies instead of Authorization headers with bearer tokens. + +Sessions expire after 24 hours of inactvity. + +Closes: #247 +Reviewed-by: John Doe + +# Please enter the commit message for your changes. Lines starting +# with '#' will be ignored, and an empty message aborts the commit. +# +# On branch main +# Changes to be committed: +# modified: myfile.txt +# diff --git a/harper-git-commit/tests/test_sources/simple_commit.txt b/harper-git-commit/tests/test_sources/simple_commit.txt new file mode 100644 index 000000000..eb8a4ac79 --- /dev/null +++ b/harper-git-commit/tests/test_sources/simple_commit.txt @@ -0,0 +1 @@ +A simple commit with a typo: comit diff --git a/harper-ls/Cargo.toml b/harper-ls/Cargo.toml index 509995e4a..2def5b440 100644 --- a/harper-ls/Cargo.toml +++ b/harper-ls/Cargo.toml @@ -12,6 +12,7 @@ harper-stats = { path = "../harper-stats", version = "0.68.0" } harper-literate-haskell = { path = "../harper-literate-haskell", version = "0.68.0" } harper-core = { path = "../harper-core", version = "0.68.0", features = ["concurrent"] } harper-comments = { path = "../harper-comments", version = "0.68.0" } +harper-git-commit = { path = "../harper-git-commit", version = "0.68.0" } harper-typst = { path = "../harper-typst", version = "0.68.0" } harper-html = { path = "../harper-html", version = "0.68.0" } harper-python = { path = "../harper-python", version = "0.68.0" } diff --git a/harper-ls/src/backend.rs b/harper-ls/src/backend.rs index a02e62db1..8095857ab 100644 --- a/harper-ls/src/backend.rs +++ b/harper-ls/src/backend.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use crate::config::Config; use crate::dictionary_io::{load_dict, save_dict}; use crate::document_state::DocumentState; -use crate::git_commit_parser::GitCommitParser; use crate::ignored_lints_io::{load_ignored_lints, save_ignored_lints}; use crate::io_utils::fileify_path; use anyhow::{Context, Result, anyhow}; @@ -19,6 +18,7 @@ use harper_core::parsers::{ }; use harper_core::spell::{Dictionary, FstDictionary, MergedDictionary, MutableDictionary}; use harper_core::{Dialect, DictWordMetadata, Document, IgnoredLints}; +use harper_git_commit::GitCommitParser; use harper_html::HtmlParser; use harper_ink::InkParser; use harper_literate_haskell::LiterateHaskellParser; @@ -382,9 +382,7 @@ impl Backend { } "ink" => Some(Box::new(InkParser::default())), "markdown" => Some(Box::new(Markdown::new(markdown_options))), - "git-commit" | "gitcommit" => { - Some(Box::new(GitCommitParser::new_markdown(markdown_options))) - } + "git-commit" | "gitcommit" => Some(Box::new(GitCommitParser::new(markdown_options))), "html" => Some(Box::new(HtmlParser::default())), "mail" | "plaintext" | "text" => Some(Box::new(PlainEnglish)), "typst" => Some(Box::new(Typst)), diff --git a/harper-ls/src/git_commit_parser.rs b/harper-ls/src/git_commit_parser.rs deleted file mode 100644 index 4c1bdb40e..000000000 --- a/harper-ls/src/git_commit_parser.rs +++ /dev/null @@ -1,32 +0,0 @@ -use harper_core::Lrc; -use harper_core::parsers::{Markdown, MarkdownOptions, Parser}; - -/// A Harper parser for Git commit files -#[derive(Clone)] -pub struct GitCommitParser { - inner: Lrc, -} - -impl GitCommitParser { - pub fn new(parser: Lrc) -> Self { - Self { inner: parser } - } - - pub fn new_markdown(markdown_options: MarkdownOptions) -> Self { - Self::new(Lrc::new(Markdown::new(markdown_options))) - } -} - -impl Parser for GitCommitParser { - /// Admittedly a somewhat naive implementation. - /// We're going to get _something_ to work, before we polish it off. - fn parse(&self, source: &[char]) -> Vec { - // Locate the first `#` - let end = source - .iter() - .position(|c| *c == '#') - .unwrap_or(source.len()); - - self.inner.parse(&source[0..end]) - } -} diff --git a/harper-ls/src/main.rs b/harper-ls/src/main.rs index 3c34fc64d..1331b8ff4 100644 --- a/harper-ls/src/main.rs +++ b/harper-ls/src/main.rs @@ -9,7 +9,7 @@ mod config; mod diagnostics; mod dictionary_io; mod document_state; -mod git_commit_parser; + mod ignored_lints_io; mod io_utils; mod pos_conv;