|
| 1 | +import bs4 |
1 | 2 | import feedparser
|
2 | 3 | import regex as re
|
3 | 4 |
|
|
8 | 9 | # feedparser 6.x
|
9 | 10 | from feedparser.datetimes import _parse_date
|
10 | 11 |
|
11 |
| -import bs4 |
12 |
| - |
13 | 12 | from threatingestor.sources import Source
|
14 | 13 |
|
15 | 14 | class Plugin(Source):
|
16 | 15 |
|
17 |
| - def __init__(self, name, url, feed_type, filter=None): |
| 16 | + def __init__(self, name, url, feed_type, include=None, exclude=None): |
18 | 17 | self.name = name
|
19 | 18 | self.url = url
|
20 | 19 | self.feed_type = feed_type
|
21 |
| - self.filter = filter |
| 20 | + self.include = include |
| 21 | + self.exclude = exclude |
22 | 22 |
|
23 | 23 | def run(self, saved_state):
|
24 | 24 | feed = feedparser.parse(self.url)
|
@@ -48,34 +48,45 @@ def run(self, saved_state):
|
48 | 48 | [x.unwrap() for x in soup.find_all('i')]
|
49 | 49 | soup = bs4.BeautifulSoup(soup.decode(), 'html.parser')
|
50 | 50 |
|
51 |
| - text = '' |
52 |
| - |
53 |
| - if self.filter is not None: |
54 |
| - |
55 |
| - rss_query = re.compile(r"{0}".format(self.filter)).findall(str(self.filter.split('|'))) |
56 |
| - |
57 |
| - for r in rss_query: |
58 |
| - if self.feed_type == 'afterioc': |
59 |
| - text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1] |
60 |
| - |
61 |
| - if r in text: |
| 51 | + text = "" |
| 52 | + |
| 53 | + if self.exclude is not None: |
| 54 | + rss_exclude = re.sub(re.compile(fr"{self.exclude}", re.IGNORECASE), "", str(item.get('link'))) |
| 55 | + |
| 56 | + if rss_exclude: |
| 57 | + if "http" in rss_exclude: |
| 58 | + if self.feed_type == "afterioc": |
| 59 | + text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1] |
| 60 | + artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True) |
| 61 | + elif self.feed_type == "clean": |
| 62 | + text = soup.get_text(separator=' ') |
| 63 | + artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True) |
| 64 | + else: |
| 65 | + # Default: self.feed_type == 'messy'. |
| 66 | + text = soup.get_text(separator=' ') |
| 67 | + artifacts += self.process_element(text, item.get('link')) |
| 68 | + |
| 69 | + if self.include is not None: |
| 70 | + rss_include = re.compile(r"{0}".format(self.include)).findall(str(self.include.split('|'))) |
| 71 | + |
| 72 | + for rss_f in rss_include: |
| 73 | + if rss_f in item.get('link'): |
| 74 | + if self.feed_type == "afterioc": |
| 75 | + text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1] |
62 | 76 | artifacts += self.process_element(text, item.get('link') or self.url, include_nonobfuscated=True)
|
63 |
| - elif self.feed_type == 'clean': |
64 |
| - text = soup.get_text(separator=' ') |
65 |
| - |
66 |
| - if r in text: |
| 77 | + elif self.feed_type == "clean": |
| 78 | + text = soup.get_text(separator=' ') |
67 | 79 | artifacts += self.process_element(text, item.get('link') or self.url, include_nonobfuscated=True)
|
68 |
| - else: |
69 |
| - # Default: self.feed_type == 'messy'. |
70 |
| - text = soup.get_text(separator=' ') |
71 |
| - artifacts += self.process_element(text, item.get('link') or self.url) |
72 |
| - |
73 |
| - else: |
| 80 | + else: |
| 81 | + # Default: self.feed_type == 'messy'. |
| 82 | + text = soup.get_text(separator=' ') |
| 83 | + artifacts += self.process_element(text, item.get('link') or self.url) |
74 | 84 |
|
75 |
| - if self.feed_type == 'afterioc': |
| 85 | + if self.include is None and self.exclude is None: |
| 86 | + if self.feed_type == "afterioc": |
76 | 87 | text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
|
77 | 88 | artifacts += self.process_element(text, item.get('link') or self.url, include_nonobfuscated=True)
|
78 |
| - elif self.feed_type == 'clean': |
| 89 | + elif self.feed_type == "clean": |
79 | 90 | text = soup.get_text(separator=' ')
|
80 | 91 | artifacts += self.process_element(text, item.get('link') or self.url, include_nonobfuscated=True)
|
81 | 92 | else:
|
|
0 commit comments