
Adding YouTubeScraper.get_channel_links
Yomguithereal committed Dec 7, 2023
1 parent cb8e649 commit 5d7a444
Showing 4 changed files with 50 additions and 5 deletions.
9 changes: 7 additions & 2 deletions ftest/youtube_scraping.py
@@ -1,3 +1,8 @@
-from minet.youtube.scrapers import scrape_channel_id
+from minet.youtube.scraper import YouTubeScraper
 
-print(scrape_channel_id("https://www.youtube.com/@MonsieurPhi"))
+scraper = YouTubeScraper()
+
+links = scraper.get_channel_links(
+    "https://www.youtube.com/channel/UCHGFbA0KWBgf6gMbyUCZeCQ"
+)
+print(links)
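
Since get_channel_links returns Optional[Set[str]] (see minet/youtube/scraper.py below), a slightly more defensive version of this fixture would guard against the None case. A minimal sketch, using the same channel URL:

from minet.youtube.scraper import YouTubeScraper

scraper = YouTubeScraper()

links = scraper.get_channel_links(
    "https://www.youtube.com/channel/UCHGFbA0KWBgf6gMbyUCZeCQ"
)

# None signals that ytInitialData could not be found or parsed on the page
if links is None:
    print("no links could be extracted")
else:
    for link in sorted(links):
        print(link)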
42 changes: 41 additions & 1 deletion minet/youtube/scraper.py
@@ -1,9 +1,10 @@
-from typing import List, Tuple, Optional
+from typing import List, Set, Tuple, Optional
 
 import re
 import json
 from html import unescape
 from urllib.parse import unquote
+from ural import infer_redirection
 
 from minet.scrape import WonderfulSoup
 from minet.web import (
@@ -20,6 +21,30 @@
 
 
 CAPTION_TRACKS_RE = re.compile(r'"captionTracks":(\[.*?\])')
+INITIAL_DATA_RE = re.compile(
+    rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+});</script>"
+)
+
+
+def gather_url_endpoints(data):
+    if isinstance(data, dict):
+        for k, v in data.items():
+            if k == "urlEndpoint":
+                if not isinstance(v, dict):
+                    return
+
+                yield infer_redirection(v["url"])
+
+                return
+
+            yield from gather_url_endpoints(v)
+
+    elif isinstance(data, list):
+        for v in data:
+            yield from gather_url_endpoints(v)
+
+    else:
+        return
 
 
 def select_caption_track(
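
To make gather_url_endpoints concrete: it walks the ytInitialData tree recursively, yields the resolved URL of every "urlEndpoint" node it encounters, and stops exploring a dict once such a node is found in it. A toy run, in which every key except "urlEndpoint" and "url" is invented for illustration, and where infer_redirection simply returns URLs it does not recognize as redirections:

# Assumes gather_url_endpoints from the diff above is in scope
payload = {
    "header": {
        "channelLinks": [
            {"urlEndpoint": {"url": "https://twitter.com/some_account"}},
            {"urlEndpoint": {"url": "https://www.example.com/blog"}},
        ]
    }
}

print(set(gather_url_endpoints(payload)))
# {'https://twitter.com/some_account', 'https://www.example.com/blog'}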
@@ -126,3 +151,18 @@ def get_channel_id(self, channel_url: str) -> Optional[str]:
             return tag.get("content")
 
         return None
+
+    def get_channel_links(self, channel_url: str) -> Optional[Set[str]]:
+        response = self.request(channel_url, spoof_ua=True)
+
+        match = INITIAL_DATA_RE.search(response.body)
+
+        if match is None:
+            return None
+
+        try:
+            data = json.loads(match.group(1))
+        except json.JSONDecodeError:
+            return None
+
+        return set(gather_url_endpoints(data))
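
Two details worth noting in get_channel_links: INITIAL_DATA_RE is a bytes pattern (rb"...") searched over the raw response.body without decoding, and json.loads accepts bytes directly (since Python 3.6). Below is a rough standalone equivalent using only the standard library, offered as a sketch rather than a faithful reproduction of minet's request helpers: the hardcoded User-Agent merely stands in for spoof_ua, and YouTube may serve consent pages or different markup depending on headers and region.

import re
import json
from urllib.request import Request, urlopen

INITIAL_DATA_RE = re.compile(
    rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+});</script>"
)

req = Request(
    "https://www.youtube.com/channel/UCHGFbA0KWBgf6gMbyUCZeCQ",
    headers={"User-Agent": "Mozilla/5.0"},  # stand-in for minet's spoofed UA
)

with urlopen(req) as response:
    body = response.read()

match = INITIAL_DATA_RE.search(body)

if match is not None:
    data = json.loads(match.group(1))  # bytes are accepted here
    print(type(data))  # a dict mirroring the page's ytInitialData object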
2 changes: 1 addition & 1 deletion requirements.txt
@@ -30,5 +30,5 @@ tenacity==8.2.1
 trafilatura==1.6.0
 typing_extensions>=4.3; python_version < '3.11'
 twitwi==0.18.2
-ural==1.2.1
+ural==1.2.2
 urllib3==1.26.16
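
The ural pin moves from 1.2.1 to 1.2.2 here (and in setup.py below), presumably because gather_url_endpoints relies on ural.infer_redirection to unwrap the interstitial redirect URLs YouTube places in front of external channel links; the commit does not say so explicitly, so treat this as an assumption. An illustration of the intended behavior:

from ural import infer_redirection

# A non-redirecting URL passes through unchanged
print(infer_redirection("https://www.example.com/blog"))

# A YouTube-style redirect wrapper should resolve to its target,
# assuming ural 1.2.2 recognizes this pattern
wrapped = "https://www.youtube.com/redirect?q=https%3A%2F%2Fwww.example.com%2Fblog"
print(infer_redirection(wrapped))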
2 changes: 1 addition & 1 deletion setup.py
@@ -45,7 +45,7 @@
         "tenacity>=8,<9",
         "trafilatura>=1.6,<1.7",
         "twitwi>=0.18.2,<0.19",
-        "ural>=1.2,<2",
+        "ural>=1.2.2,<2",
         "urllib3>=1.26.16,<2",
     ],
     extras_require={
