diff --git a/crates/redos-wasm/src/lib.rs b/crates/redos-wasm/src/lib.rs index 8aa85e0..000ef9e 100644 --- a/crates/redos-wasm/src/lib.rs +++ b/crates/redos-wasm/src/lib.rs @@ -11,11 +11,7 @@ pub fn ir(regex: &str) -> String { let parser = Parser::parse(regex); format!( "{:#?}", - parser.map(|tree| redos::ir::to_expr( - &tree.expr, - &Default::default(), - nonzero_lit::usize!(1) - )) + parser.map(|tree| redos::ir::to_expr(&tree.expr, &Default::default())) ) } diff --git a/crates/redos/src/ilq.rs b/crates/redos/src/ilq.rs index b2da2c7..2f4f1d0 100644 --- a/crates/redos/src/ilq.rs +++ b/crates/redos/src/ilq.rs @@ -50,7 +50,7 @@ pub fn scan_ilq(expr: &Expr) -> IlqReturn { // a required token. scan_ilq_concat(list) - }, + } Expr::Group(e, _) => scan_ilq(e), // a repeating token? interesting.. we'll need to scan the child @@ -72,7 +72,6 @@ enum ConcatResults { } fn scan_ilq_concat(exprs: &Vec) -> IlqReturn { - // first, lets try to hit a repeat token for expr in exprs { let result: ConcatResults = match expr { diff --git a/crates/redos/src/ir.rs b/crates/redos/src/ir.rs index a9c1d09..a02f2d5 100644 --- a/crates/redos/src/ir.rs +++ b/crates/redos/src/ir.rs @@ -1,7 +1,7 @@ //! Intermediate representation of a regular expression. //! Used to simplify the AST and make it easier to work with. -use std::num::NonZeroUsize; +use std::{num::NonZeroUsize, rc::Rc}; use fancy_regex::{Assertion, Expr as RegexExpr, LookAround}; @@ -29,51 +29,112 @@ pub enum ExprConditional { BackrefExistsCondition(usize), } +fn option_rc(option: Option) -> Option> { + option.map(|x| Rc::new(x)) +} + #[derive(Debug, PartialEq, Eq, Clone)] pub struct ExprNode { - current: Expr, - previous: Option>, - next: Option>, - parent: Option> + current: Expr, + previous: Option>, + next: Option>, + parent: Option>, +} + +impl ExprNode { + /// Helper function that creates a leaf for the IR generation + fn new_leaf(current: Expr, previous: Option, parent: Option) -> ExprNode { + ExprNode { + current, + previous: option_rc(previous), + next: None, + parent: option_rc(parent), + } + } + + /// Helper function that produces a dummy value + fn dummy() -> ExprNode { + ExprNode { + current: Expr::Token(Token { + yes: vec![], + no: vec![], + }), + previous: None, + next: None, + parent: None, + } + } } #[derive(Debug, PartialEq, Eq, Clone)] pub enum Value { - Singular(String), - Range(String, String) + Singular(String), + Range(String, String), } #[derive(Debug, PartialEq, Eq, Clone)] pub struct Token { - /// Singular tokens that can be matched in this token - yes: Vec, - /// Singular tokens that can't be matched in this token - no: Vec + /// Singular tokens that can be matched in this token + yes: Vec, + /// Singular tokens that can't be matched in this token + no: Vec, } impl Token { - /// Creates a new token. - /// Takes in a basic regex that is either a single character - /// or a character class. - fn new(regex: &str) -> Token { - if !(regex.contains('[') || regex.contains(']')) { - // This isn't a character class - just a single character - Token { - yes: vec![Value::Singular(regex.to_string())], - no: vec![] - } - } else { - unimplemented!("No support for parsing character classes yet.") + /// Creates a new token. + /// Takes in a basic regex that is either a single character + /// or a character class. + fn new(regex: &str) -> Token { + if !(regex.contains('[') || regex.contains(']')) { + // This isn't a character class - just a single character + Token { + yes: vec![Value::Singular(regex.to_string())], + no: vec![], + } + } else { + unimplemented!("No support for parsing character classes yet.") + } + } + + fn new_ignore_case(regex: &str) -> Token { + unimplemented!("Can not ignore case when creating tokens yet.") + } + + fn overlaps(&self, token: &Token) -> bool { + unimplemented!("Can not detect overlapping tokens yet.") } - } +} + +fn container( + previous: Option, + parent: Option, + group_increment: NonZeroUsize, + config: &VulnerabilityConfig, + expr: &RegexExpr +) -> Option { + let mut node = ExprNode::new_leaf( + Expr::Group(Box::new(ExprNode::dummy()), group_increment.into()), + previous, + parent, + ); - fn new_ignore_case(regex: &str) -> Token { - unimplemented!("Can not ignore case when creating tokens yet.") - } + let nest: Option = to_nested_expr( + expr, + config, + group_increment + .checked_add(1) + .expect("group increment overflow"), + Some(node), + Some(node), + ); - fn overlaps(&self, token: &Token) -> bool { - unimplemented!("Can not detect overlapping tokens yet.") - } + if nest.is_none() { + return None; + } + + node.current = Expr::Group(Box::new(nest.unwrap()), group_increment.into()); + + Some(node) } #[derive(Debug, PartialEq, Eq, Clone)] @@ -84,93 +145,158 @@ pub enum Expr { Assertion(IrAssertion), /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of /// the literal `a` and `.` for any character - Concat(Vec), + Concat(Vec), /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative /// where either the literal `a` or `b` must match - Alt(Vec), + Alt(Vec), /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures" /// (remembers) the match /// /// The usize is the number of the capturing group, starting from 1 - Group(Box, usize), + Group(Box, usize), /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g. /// `(?=a)` means the next character must be `a` (but the match is not consumed) - LookAround(Box, LookAround), + LookAround(Box, LookAround), /// Some large repeat of an expression. // Implementation Note: Greedy does not matter as if it doesn't match (in the case of ReDoS abuse), // greedy will not affect its matching because of the terminal token. - Repeat(Box), + Repeat(Box), /// Optional expression, e.g. `a?` means `a` is optional - Optional(Box), + Optional(Box), /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and /// never backtrack and try `a`, even if matching fails after the atomic group. - AtomicGroup(Box), + AtomicGroup(Box), /// If/Then/Else Condition. If there is no Then/Else, these will just be empty expressions. Conditional { /// The conditional expression to evaluate condition: ExprConditional, /// What to execute if the condition is true - true_branch: Box, + true_branch: Box, /// What to execute if the condition is false - false_branch: Box, + false_branch: Box, }, } -/// Converts a fancy-regex AST to an IR AST -pub fn to_expr( +pub fn to_expr(expr: &RegexExpr, config: &VulnerabilityConfig) -> Option { + to_nested_expr(expr, config, nonzero_lit::usize!(1), None, None) +} + +fn to_nested_expr( expr: &RegexExpr, config: &VulnerabilityConfig, group_increment: NonZeroUsize, -) -> Option { + parent: Option, + previous: Option, +) -> Option { match expr { RegexExpr::Empty => None, - RegexExpr::Any { newline } => Some(Expr::Token(if *newline { - Token::new(".") - } else { - Token { - yes: vec![Value::Singular(".".to_string())], - no: vec![Value::Singular("\\n".to_string())] - } - })), - RegexExpr::Assertion(a) => Some(Expr::Assertion(match a { - // Since start and line only depend on the multiline flag, - // they don't particurally matter for ReDoS detection. - Assertion::StartText => IrAssertion::Start, - Assertion::EndText => IrAssertion::End, - Assertion::StartLine { .. } => IrAssertion::Start, - Assertion::EndLine { .. } => IrAssertion::End, - - Assertion::LeftWordBoundary => IrAssertion::LeftWordBoundary, - Assertion::RightWordBoundary => IrAssertion::RightWordBoundary, - Assertion::WordBoundary => IrAssertion::WordBoundary, - Assertion::NotWordBoundary => IrAssertion::NotWordBoundary, - })), - RegexExpr::Literal { casei, val } => Some(Expr::Token(if *casei { - Token::new_ignore_case(val) - } else { - Token::new(val) - })), - // TODO: propagate group increment - RegexExpr::Concat(list) => Some(Expr::Concat( - list.iter() - .filter_map(|e| to_expr(e, config, group_increment)) - .collect(), + RegexExpr::Any { newline } => Some(ExprNode::new_leaf( + Expr::Token(if *newline { + Token::new(".") + } else { + Token { + yes: vec![Value::Singular(".".to_string())], + no: vec![Value::Singular("\\n".to_string())], + } + }), + previous, + parent, )), - RegexExpr::Alt(list) => Some(Expr::Alt( - list.iter() - .filter_map(|e| to_expr(e, config, group_increment)) - .collect(), + RegexExpr::Assertion(a) => Some(ExprNode::new_leaf( + Expr::Assertion(match a { + // Since start and line only depend on the multiline flag, + // they don't particurally matter for ReDoS detection. + Assertion::StartText => IrAssertion::Start, + Assertion::EndText => IrAssertion::End, + Assertion::StartLine { .. } => IrAssertion::Start, + Assertion::EndLine { .. } => IrAssertion::End, + + Assertion::LeftWordBoundary => IrAssertion::LeftWordBoundary, + Assertion::RightWordBoundary => IrAssertion::RightWordBoundary, + Assertion::WordBoundary => IrAssertion::WordBoundary, + Assertion::NotWordBoundary => IrAssertion::NotWordBoundary, + }), + previous, + parent, )), - RegexExpr::Group(e) => to_expr( - e, - config, - group_increment - .checked_add(1) - .expect("group increment overflow"), - ) - .map(|e| Expr::Group(Box::new(e), group_increment.into())), + RegexExpr::Literal { casei, val } => Some(ExprNode::new_leaf( + Expr::Token(if *casei { + Token::new_ignore_case(val) + } else { + Token::new(val) + }), + previous, + parent, + )), + // TODO: propagate group increment + RegexExpr::Concat(list) => { + let mut concat_node = ExprNode::new_leaf(Expr::Concat(vec![]), previous, parent); + + let no_siblings_list = list.iter() + .filter_map(|e| to_nested_expr(e, config, group_increment, Some(concat_node), None)) + .collect::>(); + + let nodes = no_siblings_list.iter() + .enumerate() + .map(|(i, mut e)| { + let previous = if i == 0 { + concat_node + } else { + no_siblings_list[i] + }; + + e.previous = Some(previous.into()); + + *e + }) + .collect::>(); + + if nodes.is_empty() { + return None; + } + + concat_node.current = Expr::Concat(nodes); + + Some(concat_node) + }, + RegexExpr::Alt(list) => { + let mut alt_expr_node = ExprNode::new_leaf(Expr::Alt(vec![]), previous, parent); + + let list = list.iter() + .filter_map(|e| to_nested_expr(e, config, group_increment, Some(alt_expr_node), Some(alt_expr_node))) + .collect(); + + alt_expr_node.current = Expr::Alt(list); + + Some(alt_expr_node) + }, + RegexExpr::Group(e) => { + let mut group = ExprNode::new_leaf( + Expr::Group(Box::new(ExprNode::dummy()), group_increment.into()), + previous, + parent, + ); + + let group_nest: Option = to_nested_expr( + e, + config, + group_increment + .checked_add(1) + .expect("group increment overflow"), + Some(group), + Some(group), + ); + + if group_nest.is_none() { + return None; + } + + group.current = Expr::Group(Box::new(group_nest.unwrap()), group_increment.into()); + + Some(group) + } RegexExpr::LookAround(e, la) => { - to_expr(e, config, group_increment).map(|e| Expr::LookAround(Box::new(e), *la)) + to_nested_expr(e, config, group_increment).map(|e| Expr::LookAround(Box::new(e), *la)) } RegexExpr::Repeat { child, @@ -194,34 +320,36 @@ pub fn to_expr( } } // Delegates essentially forcibly match some string, so we can turn them into a token - RegexExpr::Delegate { inner, casei, .. } => Some(Expr::Token(if *casei { - Token::new_ignore_case(inner) + RegexExpr::Delegate { inner, casei, .. } => Some(ExprNode::new_leaf(Expr::Token(if *casei { + Token::new_ignore_case(inner) } else { - Token::new(inner) - })), + Token::new(inner) + }), previous, parent)), // note that since we convert backrefs to tokens, the complexity of a vulnerability // may underestimate the actual complexity, though this will not cause // false negatives RegexExpr::Backref(_) => unimplemented!("Backrefs are not supported yet."), RegexExpr::AtomicGroup(e) => { - to_expr(e, config, group_increment).map(|e| Expr::AtomicGroup(Box::new(e))) + to_nested_expr(e, config, group_increment).map(|e| Expr::AtomicGroup(Box::new(e))) + } + RegexExpr::KeepOut => unimplemented!("Keep out not supported."), + RegexExpr::ContinueFromPreviousMatchEnd => { + unimplemented!("Continue from previous match end not supported.") } - RegexExpr::KeepOut => None, - RegexExpr::ContinueFromPreviousMatchEnd => None, - RegexExpr::BackrefExistsCondition(_) => None, + RegexExpr::BackrefExistsCondition(_) => unimplemented!("Backref conditions not supported"), RegexExpr::Conditional { condition, true_branch, false_branch, } => { - let true_branch = to_expr(true_branch, config, group_increment); - let false_branch = to_expr(false_branch, config, group_increment); + let true_branch = to_nested_expr(true_branch, config, group_increment); + let false_branch = to_nested_expr(false_branch, config, group_increment); if let (Some(true_branch), Some(false_branch)) = (true_branch, false_branch) { let condition: Option = match condition.as_ref() { &RegexExpr::BackrefExistsCondition(number) => { Some(ExprConditional::BackrefExistsCondition(number)) } - expr => to_expr(expr, config, group_increment) + expr => to_nested_expr(expr, config, group_increment) .map(|x| ExprConditional::Condition(Box::new(x))), }; diff --git a/crates/redos/src/lib.rs b/crates/redos/src/lib.rs index 5092635..784925a 100644 --- a/crates/redos/src/lib.rs +++ b/crates/redos/src/lib.rs @@ -19,7 +19,7 @@ impl RegexInfo { fn merge(self, other: RegexInfo) -> RegexInfo { RegexInfo { has_repeat: self.has_repeat || other.has_repeat, - has_alternation: self.has_alternation || other.has_alternation + has_alternation: self.has_alternation || other.has_alternation, } } @@ -85,7 +85,7 @@ fn regex_pre_scan(expr: &Expr) -> RegexInfo { } ExprConditional::Condition(condition) => regex_pre_scan(condition.as_ref()) .merge(regex_pre_scan_nested(true_branch.as_ref())) - .merge(regex_pre_scan_nested(false_branch.as_ref())) + .merge(regex_pre_scan_nested(false_branch.as_ref())), } } } diff --git a/crates/redos/src/nq.rs b/crates/redos/src/nq.rs index 31ec839..8efd84e 100644 --- a/crates/redos/src/nq.rs +++ b/crates/redos/src/nq.rs @@ -39,7 +39,5 @@ pub fn scan_nq(expr: &Expr) -> NqReturn { } fn scan_concat(exprs: &Vec) -> Expr { - for expr in exprs { - - } + for expr in exprs {} }