Skip to content

Commit 6ffff46

Browse files
Add exclusion to sitemap source
1 parent b387488 commit 6ffff46

File tree

1 file changed

+29 −9 lines changed

threatingestor/sources/sitemap.py

+29 −9
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,11 @@
88

99
class Plugin(Source):
1010

11-
def __init__(self, name, url, filter=None, path=None):
11+
def __init__(self, name, url, include=None, exclude=None, path=None):
1212
self.name = name
1313
self.url = url
14-
self.filter = filter
14+
self.include = include
15+
self.exclude = exclude
1516
self.path = path
1617

1718
def run(self, saved_state):
@@ -47,13 +48,13 @@ def run(self, saved_state):
4748
[x.unwrap() for x in soup.find_all('i')]
4849
soup = BeautifulSoup(soup.decode(), 'html.parser')
4950

50-
if self.filter is not None:
51+
if self.exclude is not None:
5152
# Regex input via config.yml
5253
# Example: security|threat|malware
53-
xml_query = re.compile(r"{0}".format(self.filter)).findall(str(self.filter.split('|')))
54+
xml_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|')))
5455

5556
# Iterates over the regex output to locate all provided keywords
56-
for x in xml_query:
57+
for xe in xml_exclude:
5758
# Uses a path instead of a keyword
5859
if self.path is not None:
5960
if self.path in loc:
@@ -62,19 +63,38 @@ def run(self, saved_state):
6263

6364
# Only filters using a keyword
6465
if self.path is None:
65-
if x in loc:
66+
if xe not in loc:
6667
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
6768
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
6869

69-
elif self.filter is None and self.path is not None:
70-
# Filters only by path in XML loc, no set filter
70+
if self.include is not None:
71+
# Regex input via config.yml
72+
# Example: security|threat|malware
73+
xml_include = re.compile(r"{0}".format(self.include)).findall(str(self.include.split('|')))
74+
75+
# Iterates over the regex output to locate all provided keywords
76+
for xi in xml_include:
77+
# Uses a path instead of a keyword
78+
if self.path is not None:
79+
if self.path in loc:
80+
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
81+
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
82+
83+
# Only filters using a keyword
84+
if self.path is None:
85+
if xi in loc:
86+
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
87+
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
88+
89+
if self.include is None or self.exclude is None and self.path is not None:
90+
# Filters only by path in XML loc, no set include
7191
# Default: /path/name/*
7292

7393
if self.path in loc:
7494
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
7595
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
7696

77-
else:
97+
if self.include is None and self.path is None and self.exclude is None:
7898
# Locates all blog links within the sitemap
7999
if "blog" in loc:
80100
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]

0 commit comments

Comments
 (0)