Skip to content

Commit 8c8481b

Browse files
Now accepts raw regex for sitemap and rss exclusion
1 parent 6ffff46 commit 8c8481b

File tree

5 files changed

+32
-31
lines changed

5 files changed

+32
-31
lines changed

config.example.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ sources:
9494
module: rss
9595
url: https://inquest.net/blog/rss
9696
feed_type: messy
97-
exclude: security|threat|research
97+
exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?
9898

9999
# Sitemap examples
100100

@@ -108,7 +108,7 @@ sources:
108108
- name: inquest-sitemap-articles
109109
module: sitemap
110110
url: https://www.inquest.net/sitemap.xml
111-
exclude: security|threat|research
111+
exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?
112112

113113
# Defaults to "blog" keyword
114114
- name: inquest-sitemap-blog

docs/sources/rss.rst

+4-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ Configuration Options
2121
* ``module`` (required): ``rss``
2222
* ``url`` (required): URL to the RSS or Atom feed.
2323
* ``feed_type`` (required): see above; if unsure, use ``messy``.
24-
* ``filter`` (optional): Regex filtering for RSS feed.
24+
* ``include`` (optional): Include filter using simplified regex.
25+
* ``exclude`` (optional): Exclude filter using raw regex.
2526

2627
Example Configuration
2728
~~~~~~~~~~~~~~~~~~~~~
@@ -34,7 +35,8 @@ Inside the ``sources`` section of your configuration file:
3435
module: rss
3536
url: https://example.com/rss.xml
3637
feed_type: messy
37-
filter: security|threat
38+
include: security|threat
39+
exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?
3840
3941
.. _sqs-source:
4042

docs/sources/sitemap.rst

+4
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ Configuration Options
1010

1111
* ``module`` (required): ``sitemap``
1212
* ``url`` (required): URL of the website with the sitemap path.
13+
* ``include`` (optional): Include filter using simplified regex.
14+
* ``exclude`` (optional): Exclude filter using raw regex.
1315

1416
Example Configuration
1517
~~~~~~~~~~~~~~~~~~~~~
@@ -21,3 +23,5 @@ Quick setup for sitemap parsing:
2123
- name: inquest-blog
2224
module: sitemap
2325
url: https://inquest.net/sitemap.xml
26+
include: security|threat|research
27+
exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?

threatingestor/sources/rss.py

+13-14
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,19 @@ def run(self, saved_state):
5151
text = ""
5252

5353
if self.exclude is not None:
54-
rss_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|')))
55-
56-
for rss_e in rss_exclude:
57-
if rss_e not in item.get('link'):
58-
if self.feed_type == "afterioc":
59-
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
60-
artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
61-
elif self.feed_type == "clean":
62-
text = soup.get_text(separator=' ')
63-
artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
64-
else:
65-
# Default: self.feed_type == 'messy'.
66-
text = soup.get_text(separator=' ')
67-
artifacts += self.process_element(text, item.get('link'))
54+
rss_exclude = re.sub(re.compile(fr"{self.exclude}", re.IGNORECASE), "", str(item.get('link')))
55+
56+
if rss_exclude:
57+
if self.feed_type == "afterioc":
58+
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
59+
artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
60+
elif self.feed_type == "clean":
61+
text = soup.get_text(separator=' ')
62+
artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
63+
else:
64+
# Default: self.feed_type == 'messy'.
65+
text = soup.get_text(separator=' ')
66+
artifacts += self.process_element(text, item.get('link'))
6867

6968
if self.include is not None:
7069
rss_include = re.compile(r"{0}".format(self.include)).findall(str(self.include.split('|')))

threatingestor/sources/sitemap.py

+9-13
Original file line numberDiff line numberDiff line change
@@ -50,20 +50,16 @@ def run(self, saved_state):
5050

5151
if self.exclude is not None:
5252
# Regex input via config.yml
53-
# Example: security|threat|malware
54-
xml_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|')))
53+
xml_exclude = re.sub(re.compile(fr"{self.exclude}", re.IGNORECASE), "", str(loc))
54+
55+
if xml_exclude:
56+
if self.path is None and "http" in xml_exclude:
57+
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
58+
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
5559

56-
# Iterates over the regex output to locate all provided keywords
57-
for xe in xml_exclude:
5860
# Uses a path instead of a keyword
5961
if self.path is not None:
60-
if self.path in loc:
61-
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
62-
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
63-
64-
# Only filters using a keyword
65-
if self.path is None:
66-
if xe not in loc:
62+
if self.path in xml_exclude:
6763
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
6864
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
6965

@@ -86,15 +82,15 @@ def run(self, saved_state):
8682
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
8783
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
8884

89-
if self.include is None or self.exclude is None and self.path is not None:
85+
if self.include is None and self.exclude is None and self.path is not None:
9086
# Filters only by path in XML loc, no set include
9187
# Default: /path/name/*
9288

9389
if self.path in loc:
9490
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
9591
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)
9692

97-
if self.include is None and self.path is None and self.exclude is None:
93+
if self.include is None and self.exclude is None and self.path is None:
9894
# Locates all blog links within the sitemap
9995
if "blog" in loc:
10096
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]

0 commit comments

Comments
 (0)