fix: use Python's raw string notation for regular expression patterns

Use raw string notation (r'...') for regular expression patterns to avoid SyntaxWarnings caused by invalid escape sequences (e.g. '\d', '\.') in ordinary string literals.
commoncrawl · Oct 23, 2024 · ae9e3d1 · ae9e3d1
1 parent 3645123
commit ae9e3d1
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 15 deletions.
diff --git a/crawlstats.py b/crawlstats.py
@@ -463,7 +463,7 @@ class HostDomainCount:
     For each item both total pages and unique URLs are counted.
     """
 
-    IPpattern = re.compile('^\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}$')
+    IPpattern = re.compile(r'^\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}$')
 
     def __init__(self):
         self.hosts = MultiCount(2)
@@ -510,7 +510,7 @@ def output(self, crawl):
 class SurtDomainCount:
     """Counters for one single SURT prefix/domain."""
 
-    robots_txt_warc_pattern = re.compile('/robotstxt/')
+    robots_txt_warc_pattern = re.compile(r'/robotstxt/')
 
     def __init__(self, surt_domain):
         self.surt_domain = surt_domain
@@ -648,9 +648,9 @@ class CCStatsJob(MRJob):
         'mapreduce.job.jvm.numtasks': '-1',
     }
 
-    s3pattern = re.compile('^s3://([^/]+)/(.+)')
-    gzpattern = re.compile('\.gz$')
-    crawlpattern = re.compile('(CC-MAIN-2\d{3}-\d{2})')
+    s3pattern = re.compile(r'^s3://([^/]+)/(.+)')
+    gzpattern = re.compile(r'\.gz$')
+    crawlpattern = re.compile(r'(CC-MAIN-2\d{3}-\d{2})')
 
     def configure_args(self):
         """Custom command line options for common crawl index statistics"""

diff --git a/plot/crawl_size.py b/plot/crawl_size.py
@@ -162,7 +162,7 @@ def plot(self):
         # -- cumulative size
         row_types = ['page cumul.', 'url estim. cumul.',
                      'digest estim. cumul.']
-        self.size_plot(self.size_by_type, row_types, ' cumul\.$',
+        self.size_plot(self.size_by_type, row_types, r' cumul\.$',
                        'Crawl Size Cumulative',
                        'Pages / Unique Items Cumulative',
                        'crawlsize/cumulative.png',
@@ -191,7 +191,7 @@ def plot(self):
         data = self.size_by_type
         data = data[data['type'].isin(row_types)]
         data.replace(to_replace='url', value='1 crawl', inplace=True)
-        self.size_plot(data, row_types, '^url estim\. cumul\. last | crawls?$',
+        self.size_plot(data, row_types, r'^url estim\. cumul\. last | crawls?$',
                        'URLs Cumulative Over Last N Crawls',
                        'Unique URLs cumulative',
                        'crawlsize/url_last_n_crawls.png',
@@ -207,7 +207,7 @@ def plot(self):
         data = self.size_by_type
         data = data[data['type'].isin(row_types)]
         data.replace(to_replace='url', value='1 crawl', inplace=True)
-        self.size_plot(data, row_types, '^URLs/pages last | crawls?$',
+        self.size_plot(data, row_types, r'^URLs/pages last | crawls?$',
                        'Ratio Unique URLs / Total Pages Captured Over Last N Crawls',
                        'URLs/Pages',
                        'crawlsize/url_page_ratio_last_n_crawls.png',
@@ -223,7 +223,7 @@ def plot(self):
         data = data[data['type'].isin(row_types)]
         data.replace(to_replace='digest estim.', value='1 crawl', inplace=True)
         self.size_plot(data, row_types,
-                       '^digest estim\. cumul\. last | crawls?$',
+                       r'^digest estim\. cumul\. last | crawls?$',
                        'Content Digest Cumulative Over Last N Crawls',
                        'Unique content digests cumulative',
                        'crawlsize/digest_last_n_crawls.png',

diff --git a/plot/mimetype.py b/plot/mimetype.py
@@ -12,12 +12,12 @@ class MimeTypeStats(TabularStats):
 
     # see https://en.wikipedia.org/wiki/Media_type#Naming
     mime_pattern_str = \
-        '(?:x-)?[a-z]+/[a-z0-9]+' \
-        '(?:[.-](?:c\+\+[a-z]*|[a-z0-9]+))*(?:\+[a-z0-9]+)?'
-    mime_pattern = re.compile('^'+mime_pattern_str+'$')
-    mime_extract_pattern = re.compile('^\s*(?:content\s*=\s*)?["\']?\s*(' +
+        r'(?:x-)?[a-z]+/[a-z0-9]+' \
+        r'(?:[.-](?:c\+\+[a-z]*|[a-z0-9]+))*(?:\+[a-z0-9]+)?'
+    mime_pattern = re.compile(r'^'+mime_pattern_str+r'$')
+    mime_extract_pattern = re.compile(r'^\s*(?:content\s*=\s*)?["\']?\s*(' +
                                       mime_pattern_str +
-                                      ')(?:\s*[;,].*)?\s*["\']?\s*$')
+                                      r')(?:\s*[;,].*)?\s*["\']?\s*$')
 
     def __init__(self):
         super().__init__()

diff --git a/top_level_domain.py b/top_level_domain.py
@@ -29,7 +29,7 @@ def __init__(self, tld):
         if tld in TopLevelDomain.tld_ccs:
             self.first_level = TopLevelDomain.tld_ccs[tld]
         elif tld.find('.'):
-            self.first_level = re.sub('^.+\.', '', tld)
+            self.first_level = re.sub(r'^.+\.', '', tld)
         if tld in TopLevelDomain.tld_types:
             self.tld_type = TopLevelDomain.tld_types[tld]
         elif tld in TopLevelDomain.tld_ccs: