Skip to content

Commit

Permalink
fix: use Python's raw string notation for regular expression patterns
Browse files Browse the repository at this point in the history
Use raw string notation (r'...') for regular expression patterns to avoid
SyntaxWarnings caused by invalid escape sequences (e.g. '\d', '\.', '\s')
in ordinary string literals.
  • Loading branch information
sebastian-nagel committed Oct 23, 2024
1 parent 3645123 commit ae9e3d1
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 15 deletions.
10 changes: 5 additions & 5 deletions crawlstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,7 @@ class HostDomainCount:
For each item both total pages and unique URLs are counted.
"""

IPpattern = re.compile('^\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}$')
IPpattern = re.compile(r'^\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}$')

def __init__(self):
self.hosts = MultiCount(2)
Expand Down Expand Up @@ -510,7 +510,7 @@ def output(self, crawl):
class SurtDomainCount:
"""Counters for one single SURT prefix/domain."""

robots_txt_warc_pattern = re.compile('/robotstxt/')
robots_txt_warc_pattern = re.compile(r'/robotstxt/')

def __init__(self, surt_domain):
self.surt_domain = surt_domain
Expand Down Expand Up @@ -648,9 +648,9 @@ class CCStatsJob(MRJob):
'mapreduce.job.jvm.numtasks': '-1',
}

s3pattern = re.compile('^s3://([^/]+)/(.+)')
gzpattern = re.compile('\.gz$')
crawlpattern = re.compile('(CC-MAIN-2\d{3}-\d{2})')
s3pattern = re.compile(r'^s3://([^/]+)/(.+)')
gzpattern = re.compile(r'\.gz$')
crawlpattern = re.compile(r'(CC-MAIN-2\d{3}-\d{2})')

def configure_args(self):
"""Custom command line options for common crawl index statistics"""
Expand Down
8 changes: 4 additions & 4 deletions plot/crawl_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def plot(self):
# -- cumulative size
row_types = ['page cumul.', 'url estim. cumul.',
'digest estim. cumul.']
self.size_plot(self.size_by_type, row_types, ' cumul\.$',
self.size_plot(self.size_by_type, row_types, r' cumul\.$',
'Crawl Size Cumulative',
'Pages / Unique Items Cumulative',
'crawlsize/cumulative.png',
Expand Down Expand Up @@ -191,7 +191,7 @@ def plot(self):
data = self.size_by_type
data = data[data['type'].isin(row_types)]
data.replace(to_replace='url', value='1 crawl', inplace=True)
self.size_plot(data, row_types, '^url estim\. cumul\. last | crawls?$',
self.size_plot(data, row_types, r'^url estim\. cumul\. last | crawls?$',
'URLs Cumulative Over Last N Crawls',
'Unique URLs cumulative',
'crawlsize/url_last_n_crawls.png',
Expand All @@ -207,7 +207,7 @@ def plot(self):
data = self.size_by_type
data = data[data['type'].isin(row_types)]
data.replace(to_replace='url', value='1 crawl', inplace=True)
self.size_plot(data, row_types, '^URLs/pages last | crawls?$',
self.size_plot(data, row_types, r'^URLs/pages last | crawls?$',
'Ratio Unique URLs / Total Pages Captured Over Last N Crawls',
'URLs/Pages',
'crawlsize/url_page_ratio_last_n_crawls.png',
Expand All @@ -223,7 +223,7 @@ def plot(self):
data = data[data['type'].isin(row_types)]
data.replace(to_replace='digest estim.', value='1 crawl', inplace=True)
self.size_plot(data, row_types,
'^digest estim\. cumul\. last | crawls?$',
r'^digest estim\. cumul\. last | crawls?$',
'Content Digest Cumulative Over Last N Crawls',
'Unique content digests cumulative',
'crawlsize/digest_last_n_crawls.png',
Expand Down
10 changes: 5 additions & 5 deletions plot/mimetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ class MimeTypeStats(TabularStats):

# see https://en.wikipedia.org/wiki/Media_type#Naming
mime_pattern_str = \
'(?:x-)?[a-z]+/[a-z0-9]+' \
'(?:[.-](?:c\+\+[a-z]*|[a-z0-9]+))*(?:\+[a-z0-9]+)?'
mime_pattern = re.compile('^'+mime_pattern_str+'$')
mime_extract_pattern = re.compile('^\s*(?:content\s*=\s*)?["\']?\s*(' +
r'(?:x-)?[a-z]+/[a-z0-9]+' \
r'(?:[.-](?:c\+\+[a-z]*|[a-z0-9]+))*(?:\+[a-z0-9]+)?'
mime_pattern = re.compile(r'^'+mime_pattern_str+r'$')
mime_extract_pattern = re.compile(r'^\s*(?:content\s*=\s*)?["\']?\s*(' +
mime_pattern_str +
')(?:\s*[;,].*)?\s*["\']?\s*$')
r')(?:\s*[;,].*)?\s*["\']?\s*$')

def __init__(self):
super().__init__()
Expand Down
2 changes: 1 addition & 1 deletion top_level_domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def __init__(self, tld):
if tld in TopLevelDomain.tld_ccs:
self.first_level = TopLevelDomain.tld_ccs[tld]
elif tld.find('.'):
self.first_level = re.sub('^.+\.', '', tld)
self.first_level = re.sub(r'^.+\.', '', tld)
if tld in TopLevelDomain.tld_types:
self.tld_type = TopLevelDomain.tld_types[tld]
elif tld in TopLevelDomain.tld_ccs:
Expand Down

0 comments on commit ae9e3d1

Please sign in to comment.