Now accepts raw regex for sitemap and rss exclusion

battleoverflow · battleoverflow · commit 8c8481bd66c3 · 2023-11-01T11:42:15.000-05:00
diff --git a/config.example.yml b/config.example.yml
@@ -94,7 +94,7 @@ sources:
     module: rss
     url: https://inquest.net/blog/rss
     feed_type: messy
-    exclude: security|threat|research
+    exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?
 
   # Sitemap examples
 
@@ -108,7 +108,7 @@ sources:
   - name: inquest-sitemap-articles
     module: sitemap
     url: https://www.inquest.net/sitemap.xml
-    exclude: security|threat|research
+    exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?
 
   # Defaults to "blog" keyword
   - name: inquest-sitemap-blog
diff --git a/docs/sources/rss.rst b/docs/sources/rss.rst
@@ -21,7 +21,8 @@ Configuration Options
 * ``module`` (required): ``rss``
 * ``url`` (required): URL to the RSS or Atom feed.
 * ``feed_type`` (required): see above; if unsure, use ``messy``.
-* ``filter`` (optional): Regex filtering for RSS feed.
+* ``include`` (optional): Include filter using simplified regex.
+* ``exclude`` (optional): Exclude filter using raw regex.
 
 Example Configuration
 ~~~~~~~~~~~~~~~~~~~~~
@@ -34,7 +35,8 @@ Inside the ``sources`` section of your configuration file:
       module: rss
       url: https://example.com/rss.xml
       feed_type: messy
-      filter: security|threat
+      include: security|threat
+      exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?
 
 .. _sqs-source:
 
diff --git a/docs/sources/sitemap.rst b/docs/sources/sitemap.rst
@@ -10,6 +10,8 @@ Configuration Options
 
 * ``module`` (required): ``sitemap``
 * ``url`` (required): URL of the website with the sitemap path.
+* ``include`` (optional): Include filter using simplified regex.
+* ``exclude`` (optional): Exclude filter using raw regex.
 
 Example Configuration
 ~~~~~~~~~~~~~~~~~~~~~
@@ -21,3 +23,5 @@ Quick setup for sitemap parsing:
     - name: inquest-blog
       module: sitemap
       url: https://inquest.net/sitemap.xml
+      include: security|threat|research
+      exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?
diff --git a/threatingestor/sources/rss.py b/threatingestor/sources/rss.py
@@ -51,20 +51,19 @@ def run(self, saved_state):
             text = ""
 
             if self.exclude is not None:
-                rss_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|')))
-
-                for rss_e in rss_exclude:
-                    if rss_e not in item.get('link'):
-                        if self.feed_type == "afterioc":
-                            text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
-                            artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
-                        elif self.feed_type == "clean":
-                            text = soup.get_text(separator=' ')
-                            artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
-                        else:
-                            # Default: self.feed_type == 'messy'.
-                            text = soup.get_text(separator=' ')
-                            artifacts += self.process_element(text, item.get('link'))
+                rss_exclude = re.sub(re.compile(fr"{self.exclude}", re.IGNORECASE), "", str(item.get('link')))
+
+                if rss_exclude:
+                    if self.feed_type == "afterioc":
+                        text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
+                        artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
+                    elif self.feed_type == "clean":
+                        text = soup.get_text(separator=' ')
+                        artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
+                    else:
+                        # Default: self.feed_type == 'messy'.
+                        text = soup.get_text(separator=' ')
+                        artifacts += self.process_element(text, item.get('link'))
 
             if self.include is not None:
                 rss_include = re.compile(r"{0}".format(self.include)).findall(str(self.include.split('|')))
diff --git a/threatingestor/sources/sitemap.py b/threatingestor/sources/sitemap.py
@@ -50,20 +50,16 @@ def run(self, saved_state):
 
             if self.exclude is not None:
                 # Regex input via config.yml
-                # Example: security|threat|malware
-                xml_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|')))
+                xml_exclude = re.sub(re.compile(fr"{self.exclude}", re.IGNORECASE), "", str(loc))
+
+                if xml_exclude:
+                    if self.path is None and "http" in xml_exclude:
+                        text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
+                        artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
 
-                # Iterates over the regex output to locate all provided keywords
-                for xe in xml_exclude:
                     # Uses a path instead of a keyword
                     if self.path is not None:
-                        if self.path in loc:
-                            text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
-                            artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
-
-                    # Only filters using a keyword
-                    if self.path is None:
-                        if xe not in loc:
+                        if self.path in xml_exclude:
                             text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
                             artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
 
@@ -86,15 +82,15 @@ def run(self, saved_state):
                             text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
                             artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
 
-            if self.include is None or self.exclude is None and self.path is not None:
+            if self.include is None and self.exclude is None and self.path is not None:
                 # Filters only by path in XML loc, no set include
                 # Default: /path/name/*
 
                 if self.path in loc:
                     text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
                     artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
             
-            if self.include is None and self.path is None and self.exclude is None:
+            if self.include is None and self.exclude is None and self.path is None:
                 # Locates all blog links within the sitemap
                 if "blog" in loc:
                     text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]