
Adding YouTubeScraper.get_channel_links
Yomguithereal committed Dec 7, 2023
1 parent cb8e649 commit 5d7a444
Showing 4 changed files with 50 additions and 5 deletions.
9 changes: 7 additions & 2 deletions ftest/youtube_scraping.py
@@ -1,3 +1,8 @@
-from minet.youtube.scrapers import scrape_channel_id
+from minet.youtube.scraper import YouTubeScraper
 
-print(scrape_channel_id("https://www.youtube.com/@MonsieurPhi"))
+scraper = YouTubeScraper()
+
+links = scraper.get_channel_links(
+    "https://www.youtube.com/channel/UCHGFbA0KWBgf6gMbyUCZeCQ"
+)
+print(links)
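
Since get_channel_links returns Optional[Set[str]] (see minet/youtube/scraper.py below), a slightly more defensive version of this fixture would guard against the None case. A minimal sketch, using the same channel URL:

from minet.youtube.scraper import YouTubeScraper

scraper = YouTubeScraper()

links = scraper.get_channel_links(
    "https://www.youtube.com/channel/UCHGFbA0KWBgf6gMbyUCZeCQ"
)

# None signals that ytInitialData could not be found or parsed on the page
if links is None:
    print("no links could be extracted")
else:
    for link in sorted(links):
        print(link)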
42 changes: 41 additions & 1 deletion minet/youtube/scraper.py
@@ -1,9 +1,10 @@
-from typing import List, Tuple, Optional
+from typing import List, Set, Tuple, Optional
 
 import re
 import json
 from html import unescape
 from urllib.parse import unquote
+from ural import infer_redirection
 
 from minet.scrape import WonderfulSoup
 from minet.web import (
@@ -20,6 +21,30 @@
 
 
 CAPTION_TRACKS_RE = re.compile(r'"captionTracks":(\[.*?\])')
+INITIAL_DATA_RE = re.compile(
+    rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+});</script>"
+)
+
+
+def gather_url_endpoints(data):
+    if isinstance(data, dict):
+        for k, v in data.items():
+            if k == "urlEndpoint":
+                if not isinstance(v, dict):
+                    return
+
+                yield infer_redirection(v["url"])
+
+                return
+
+            yield from gather_url_endpoints(v)
+
+    elif isinstance(data, list):
+        for v in data:
+            yield from gather_url_endpoints(v)
+
+    else:
+        return
 
 
 def select_caption_track(
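
To make gather_url_endpoints concrete: it walks the ytInitialData tree recursively, yields the resolved URL of every "urlEndpoint" node it encounters, and stops exploring a dict once such a node is found in it. A toy run, in which every key except "urlEndpoint" and "url" is invented for illustration, and where infer_redirection simply returns URLs it does not recognize as redirections:

# Assumes gather_url_endpoints from the diff above is in scope
payload = {
    "header": {
        "channelLinks": [
            {"urlEndpoint": {"url": "https://twitter.com/some_account"}},
            {"urlEndpoint": {"url": "https://www.example.com/blog"}},
        ]
    }
}

print(set(gather_url_endpoints(payload)))
# {'https://twitter.com/some_account', 'https://www.example.com/blog'}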
@@ -126,3 +151,18 @@ def get_channel_id(self, channel_url: str) -> Optional[str]:
             return tag.get("content")
 
         return None
+
+    def get_channel_links(self, channel_url: str) -> Optional[Set[str]]:
+        response = self.request(channel_url, spoof_ua=True)
+
+        match = INITIAL_DATA_RE.search(response.body)
+
+        if match is None:
+            return None
+
+        try:
+            data = json.loads(match.group(1))
+        except json.JSONDecodeError:
+            return None
+
+        return set(gather_url_endpoints(data))
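
Two details worth noting in get_channel_links: INITIAL_DATA_RE is a bytes pattern (rb"...") searched over the raw response.body without decoding, and json.loads accepts bytes directly (since Python 3.6). Below is a rough standalone equivalent using only the standard library, offered as a sketch rather than a faithful reproduction of minet's request helpers: the hardcoded User-Agent merely stands in for spoof_ua, and YouTube may serve consent pages or different markup depending on headers and region.

import re
import json
from urllib.request import Request, urlopen

INITIAL_DATA_RE = re.compile(
    rb"(?:const|let|var)\s+ytInitialData\s*=\s*({.+});</script>"
)

req = Request(
    "https://www.youtube.com/channel/UCHGFbA0KWBgf6gMbyUCZeCQ",
    headers={"User-Agent": "Mozilla/5.0"},  # stand-in for minet's spoofed UA
)

with urlopen(req) as response:
    body = response.read()

match = INITIAL_DATA_RE.search(body)

if match is not None:
    data = json.loads(match.group(1))  # bytes are accepted here
    print(type(data))  # a dict mirroring the page's ytInitialData object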
2 changes: 1 addition & 1 deletion requirements.txt
@@ -30,5 +30,5 @@ tenacity==8.2.1
 trafilatura==1.6.0
 typing_extensions>=4.3; python_version < '3.11'
 twitwi==0.18.2
-ural==1.2.1
+ural==1.2.2
 urllib3==1.26.16
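
The ural pin moves from 1.2.1 to 1.2.2 here (and in setup.py below), presumably because gather_url_endpoints relies on ural.infer_redirection to unwrap the interstitial redirect URLs YouTube places in front of external channel links; the commit does not say so explicitly, so treat this as an assumption. An illustration of the intended behavior:

from ural import infer_redirection

# A non-redirecting URL passes through unchanged
print(infer_redirection("https://www.example.com/blog"))

# A YouTube-style redirect wrapper should resolve to its target,
# assuming ural 1.2.2 recognizes this pattern
wrapped = "https://www.youtube.com/redirect?q=https%3A%2F%2Fwww.example.com%2Fblog"
print(infer_redirection(wrapped))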
2 changes: 1 addition & 1 deletion setup.py
@@ -45,7 +45,7 @@
         "tenacity>=8,<9",
         "trafilatura>=1.6,<1.7",
         "twitwi>=0.18.2,<0.19",
-        "ural>=1.2,<2",
+        "ural>=1.2.2,<2",
         "urllib3>=1.26.16,<2",
     ],
     extras_require={
