Add exclusion to sitemap source

battleoverflow · battleoverflow · commit 6ffff4694cfd · 2023-10-13T11:48:10.000-05:00
diff --git a/threatingestor/sources/sitemap.py b/threatingestor/sources/sitemap.py
@@ -8,10 +8,11 @@
 
 class Plugin(Source):
 
-    def __init__(self, name, url, filter=None, path=None):
+    def __init__(self, name, url, include=None, exclude=None, path=None):
         self.name = name
         self.url = url
-        self.filter = filter
+        self.include = include
+        self.exclude = exclude
         self.path = path
 
     def run(self, saved_state):
@@ -47,13 +48,13 @@ def run(self, saved_state):
             [x.unwrap() for x in soup.find_all('i')]
             soup = BeautifulSoup(soup.decode(), 'html.parser')
 
-            if self.filter is not None:
+            if self.exclude is not None:
                 # Regex input via config.yml
                 # Example: security|threat|malware
-                xml_query = re.compile(r"{0}".format(self.filter)).findall(str(self.filter.split('|')))
+                xml_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|')))
 
                 # Iterates over the regex output to locate all provided keywords
-                for x in xml_query:
+                for xe in xml_exclude:
                     # Uses a path instead of a keyword
                     if self.path is not None:
                         if self.path in loc:
@@ -62,19 +63,38 @@ def run(self, saved_state):
 
                     # Only filters using a keyword
                     if self.path is None:
-                        if x in loc:
+                        if xe not in loc:
                             text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
                             artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
 
-            elif self.filter is None and self.path is not None:
-                # Filters only by path in XML loc, no set filter
+            if self.include is not None:
+                # Regex input via config.yml
+                # Example: security|threat|malware
+                xml_include = re.compile(r"{0}".format(self.include)).findall(str(self.include.split('|')))
+
+                # Iterates over the regex output to locate all provided keywords
+                for xi in xml_include:
+                    # Uses a path instead of a keyword
+                    if self.path is not None:
+                        if self.path in loc:
+                            text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
+                            artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
+
+                    # Only filters using a keyword
+                    if self.path is None:
+                        if xi in loc:
+                            text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
+                            artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
+
+            if self.include is None or self.exclude is None and self.path is not None:
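+                # Filters only by path in XML loc when no include is set (NOTE: `or` binds looser than `and`, so this branch also runs whenever include is None, even if path is None)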
                 # Default: /path/name/*
 
                 if self.path in loc:
                     text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
                     artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
             
-            else:
+            if self.include is None and self.path is None and self.exclude is None:
                 # Locates all blog links within the sitemap
                 if "blog" in loc:
                     text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]