^q^
KurtBestor committed Oct 4, 2023
1 parent 87239ca commit 85ed80f
Showing 43 changed files with 1,241 additions and 1,605 deletions.
23 changes: 13 additions & 10 deletions src/extractor/_4chan_downloader.py
@@ -1,18 +1,18 @@
 import downloader
-from utils import Downloader, LazyUrl, clean_title, urljoin, get_ext
+from utils import Downloader, File, clean_title, urljoin, get_ext
 from ratelimit import limits, sleep_and_retry
+import utils


-class Image:
-    def __init__(self, url, ref, n):
-        self._url = url
-        self.url = LazyUrl(ref, self.get, self)
-        self.filename = '{:04}{}'.format(n, get_ext(url))
+class File_4chan(File):
+    type = '4chan'
+    format = 'page:04;'

     @sleep_and_retry
     @limits(2, 1)
-    def get(self, _):
-        return self._url
+    def get(self):
+        return {}

@@ -30,8 +30,11 @@ def read(self):
         soup = downloader.read_soup(self.url)
         for div in soup.findAll('div', class_='fileText'):
             href = urljoin(self.url, div.a['href'])
-            img = Image(href, self.url, len(self.urls))
-            self.urls.append(img.url)
+            d = {
+                'page': len(self.urls),
+            }
+            file = File_4chan({'url': href, 'referer': self.url, 'name': utils.format('4chan', d, get_ext(href))})
+            self.urls.append(file)

         board = self.url.split('/')[3]
         title = soup.find('span', class_='subject').text
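For context on the decorators this hunk keeps: @sleep_and_retry and @limits come from the third-party ratelimit package, and the positional call limits(2, 1) means calls=2, period=1, so File_4chan.get is throttled to two calls per second, sleeping instead of raising once the window is exhausted. A minimal standalone sketch (fetch is a made-up function):

from ratelimit import limits, sleep_and_retry

@sleep_and_retry            # on RateLimitException, sleep out the window, then retry
@limits(calls=2, period=1)  # same as limits(2, 1): at most 2 calls per second
def fetch(url):
    print('fetching', url)

for u in ['a', 'b', 'c', 'd']:
    fetch(u)  # the third call blocks until the 1-second window resets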
139 changes: 74 additions & 65 deletions src/extractor/afreeca_downloader.py
@@ -1,5 +1,5 @@
 import downloader
-from utils import Soup, Downloader, get_outdir, Session, LazyUrl, try_n, format_filename, get_print, cut_pair
+from utils import Soup, Downloader, get_outdir, Session, LazyUrl, try_n, format_filename, cut_pair, File
 import ree as re
 from timee import sleep, time
 import os
@@ -10,18 +10,6 @@
 import json


-class Video:
-
-    def __init__(self, stream, referer, id, title, url_thumb):
-        self.url = LazyUrl(referer, lambda x: stream, self)
-        self.id = id
-        self.title = title
-        self.filename = format_filename(title, id, '.mp4')
-        self.url_thumb = url_thumb
-        self.thumb = BytesIO()
-        downloader.download(url_thumb, buffer=self.thumb)
-
-
 class LoginRequired(errors.LoginRequired):
     def __init__(self, *args):
         super().__init__(*args, method='browser', url='https://login.afreecatv.com/afreeca/login.php')
@@ -42,12 +30,15 @@ def fix_url(cls, url):
         return url.rstrip(' /')

     def read(self):
-        video = get_video(self.url, self.session, self.cw)
-        self.urls.append(video.url)
+        video = Video({'referer': self.url})
+        video.ready(self.cw)
+        self.urls.append(video)

-        self.setIcon(video.thumb)
+        thumb = BytesIO()
+        downloader.download(video['url_thumb'], buffer=thumb)
+        self.setIcon(thumb)

-        self.title = video.title
+        self.title = video['title']
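The rewritten read() builds Video with only a referer and defers all network work to video.ready(self.cw). Judging from this diff alone, utils.File.ready is expected to call the subclass get() and merge the returned dict into the file's mapping, which is why video['url_thumb'] and video['title'] are readable afterwards. A rough sketch of that assumed lifecycle; the real utils.File certainly differs in detail:

class FileSketch:
    # Assumed stand-in for the project's utils.File, reconstructed from
    # how Video is used in read() above; not the actual implementation.
    def __init__(self, info):
        self._info = dict(info)        # starts as just {'referer': ...}

    def ready(self, cw):
        self.cw = cw                   # subclasses like Video use self.cw in get()
        self._info.update(self.get())  # get() returns {'url', 'title', 'name', ...}

    def get(self):
        raise NotImplementedError

    def __getitem__(self, key):
        return self._info[key]         # enables video['title'], video['url_thumb']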


@try_n(4)
@@ -61,53 +52,71 @@ def _get_stream(url_m3u8):
     return stream


-def get_video(url, session, cw):
-    print_ = get_print(cw)
-
-    html = downloader.read_html(url, session=session)
-    if "document.location.href='https://login." in html:
-        raise LoginRequired()
-    if len(html) < 2000:
-        alert = re.find(r'''alert\(['"](.+?)['"]\)''', html)
-        if alert:
-            raise LoginRequired(alert)
-    soup = Soup(html)
-
-    url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
-    print_('url_thumb: {}'.format(url_thumb))
+class Video(File):
+    type = 'afreeca'

-    url_api = 'https://api.m.afreecatv.com/station/video/a/view'
-    vid = re.find(f'/player/([0-9]+)', url, err='no vid')
-    r = session.post(url_api, data={'nTitleNo': vid, 'nApiLevel': '10'}, headers={'Referer': url})
-    try:
-        s = cut_pair(r.text)
-        d = json.loads(s)
-    except Exception as e:
-        print_(r.text)
-        raise e
-    data = d['data']
-
-    title = data['full_title']
-
-    if data.get('adult_status') == 'notLogin':
-        raise LoginRequired(title)
-
-    urls_m3u8 = []
-    for file in data['files']:
-        file = file['quality_info'][0]['file']
-        urls_m3u8.append(file)
-    print_(f'urls_m3u8: {len(urls_m3u8)}')
-
-    streams = []
-    for url_m3u8 in urls_m3u8:
-        try:
-            stream = _get_stream(url_m3u8)
-        except Exception as e:
-            print(e)
-            continue #2193
-        streams.append(stream)
-    for stream in streams[1:]:
-        streams[0] += stream
-    stream = streams[0]
-    video = Video(stream, url, vid, title, url_thumb)
-    return video
+    def get(self):
+        url, session = self['referer'], self.session
+
+        html = downloader.read_html(url, session=session)
+        if "document.location.href='https://login." in html:
+            raise LoginRequired()
+        if len(html) < 2000:
+            alert = re.find(r'''alert\(['"](.+?)['"]\)''', html)
+            if alert:
+                raise LoginRequired(alert)
+        soup = Soup(html)
+
+        url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
+        self.cw.print_('url_thumb: {}'.format(url_thumb))
+
+        vid = re.find(f'/player/([0-9]+)', url, err='no vid')
+        if f'{vid}/catch' in url: #6215
+            url_api = 'https://api.m.afreecatv.com/station/video/a/catchview'
+            r = session.post(url_api, data={'nPageNo': '1', 'nLimit': '10', 'nTitleNo': vid}, headers={'Referer': url})
+            try:
+                s = cut_pair(r.text)
+                d = json.loads(s)
+            except Exception as e:
+                self.cw.print_(r.text)
+                raise e
+            data = d['data'][0]
+        else:
+            url_api = 'https://api.m.afreecatv.com/station/video/a/view'
+            r = session.post(url_api, data={'nTitleNo': vid, 'nApiLevel': '10', 'nPlaylistIdx': '0'}, headers={'Referer': url})
+            try:
+                s = cut_pair(r.text)
+                d = json.loads(s)
+            except Exception as e:
+                self.cw.print_(r.text)
+                raise e
+            data = d['data']
+
+        title = data.get('full_title') or data['title']
+
+        if data.get('adult_status') == 'notLogin':
+            raise LoginRequired(title)
+
+        urls_m3u8 = []
+        for file in data['files']:
+            if file.get('quality_info'):
+                file = file['quality_info'][0]['file']
+            else:
+                file = file['file']
+            urls_m3u8.append(file)
+        self.cw.print_(f'urls_m3u8: {len(urls_m3u8)}')
+
+        streams = []
+        for url_m3u8 in urls_m3u8:
+            try:
+                stream = _get_stream(url_m3u8)
+            except Exception as e:
+                print(e)
+                continue #2193
+            streams.append(stream)
+        for stream in streams[1:]:
+            streams[0] += stream
+        stream = streams[0]
+
+        return {'url': stream, 'title': title, 'name': format_filename(title, vid, '.mp4'), 'url_thumb': url_thumb}
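cut_pair is project-internal; its use here, trimming r.text before json.loads, suggests it extracts the first balanced {...} block from a wrapped or padded API response. A hypothetical equivalent, not the actual utils.cut_pair:

def cut_pair_sketch(text, opener='{', closer='}'):
    # Return the first balanced {...} block so json.loads succeeds
    # even if the API pads or wraps its JSON payload.
    start = text.index(opener)
    depth = 0
    for i, c in enumerate(text[start:], start):
        if c == opener:
            depth += 1
        elif c == closer:
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    raise ValueError('no balanced pair found')

# cut_pair_sketch('cb({"data": {"title": "x"}});') -> '{"data": {"title": "x"}}'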
72 changes: 38 additions & 34 deletions src/extractor/artstation_downloader.py
@@ -1,24 +1,24 @@
 #coding:utf8
 import os
 import json
 import downloader
 from error_printer import print_error
 from translator import tr_
-from utils import Downloader, Soup, get_print, lazy, Session, try_n, LazyUrl, clean_title, check_alive
+from utils import Downloader, Soup, get_print, lazy, Session, try_n, File, clean_title, check_alive, get_ext, get_max_range
 import dateutil.parser
 import utils


-class Image:
-
-    def __init__(self, post_url, date, url, page, name, data):
-        self.post_url = post_url
-        self.url = LazyUrl(post_url, lambda _: url.replace('/large/', '/4k/'), self, url)
-        self.page = page
-        self.data = data
-        ext = os.path.splitext(url.split('?')[0])[1]
-        self.filename = f'[{date}] {name}_p{page}{ext}'
+class File_artstation(File):
+    type = 'artstation'
+    format = '[date] name_ppage'
+    c_alter = 0

-    def __repr__(self):
-        return f'Image({self.filename})'
+    def alter(self): #6401
+        self.c_alter += 1
+        if self.c_alter % 2 == 0:
+            url = self['url']
+        else:
+            url = self['url'].replace('/4k/', '/large/')
+        return url
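The new alter() hook (issue 6401) gives the downloader an alternate URL on retry: the stored URL is already the /4k/ rendition (see the File_artstation construction further down), and successive alter() calls toggle between /large/ and /4k/. A standalone illustration with a made-up URL:

class AlterSketch:
    # Mirrors File_artstation.alter(): odd retries fall back to /large/,
    # even retries return to the stored /4k/ URL.
    c_alter = 0

    def __init__(self, url_4k):
        self.url = url_4k

    def alter(self):
        self.c_alter += 1
        if self.c_alter % 2 == 0:
            return self.url                          # back to /4k/
        return self.url.replace('/4k/', '/large/')   # fall back to /large/

s = AlterSketch('https://cdn.example/4k/img.jpg')
print(s.alter())  # https://cdn.example/large/img.jpg
print(s.alter())  # https://cdn.example/4k/img.jpg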



@@ -29,7 +29,14 @@ class Downloader_artstation(Downloader):
     ACCEPT_COOKIES = [r'(.*\.)?artstation\.(com|co)']
     url_main = None

+    @try_n(8)
     def init(self):
+        # 3849
+        self.session = Session()
+
+        import clf2
+        clf2.solve(self.url, self.session, self.cw)
+
         _ = self._id.replace('artstation_', '', 1)
         self.url_main = f'https://www.artstation.com/{_}'

@@ -39,9 +46,6 @@ def init(self):
         self.url = self.url_main
         self.print_(self.url)

-        # 3849
-        self.session = Session('chrome')
-
     @lazy
     def _id(self):
         _id = get_id(self.url, self.cw)
@@ -50,8 +54,7 @@ def _id(self):
     @lazy
     @try_n(2)
     def name(self):
-        html = downloader.read_html(self.url_main, session=self.session)
-        soup = Soup(html)
+        soup = downloader.read_soup(self.url_main, session=self.session)
         name = soup.find('meta', {'property': 'og:title'}).attrs['content']
         return clean_title(f'{name} ({self._id})')

@@ -68,7 +71,7 @@ def read(self):
         imgs = get_imgs(id_, self.title, self.session, cw=self.cw)

         for img in imgs:
-            self.urls.append(img.url)
+            self.urls.append(img)

         self.title = self.name

@@ -127,11 +130,13 @@ def get_imgs(id_, title, session, cw=None):
     while i < len(datas):
         check_alive(cw)
         data = datas[i]
-        date = data['created_at'][2:10]
+        date = data['created_at']
         post_url = data['permalink']
         #print('post_url', post_url)
         id_art = get_id_art(post_url)
         imgs += get_imgs_page(id_art, session, date=date, cw=cw, names=names)
+        if len(imgs) >= get_max_range(cw):
+            break
         if cw:
             cw.setTitle(f'{tr_("이미지 읽는 중...")} {title} - {i+1} / {len(datas)} ({len(imgs)})')
         else:
@@ -188,31 +193,29 @@ def get_imgs_page(id_art, session, date=None, cw=None, names=None):
         names.add(name.lower())

     try:
-        html = downloader.read_html(url_json, session=session, referer=post_url)
-        data = json.loads(html)
+        data = downloader.read_json(url_json, session=session, referer=post_url)
         imgs_ = data['assets']
     except Exception as e:
         print_(print_error(e))
         return []

     if date is None:
-        date = data['created_at'][2:10]
+        date = data['created_at']
+    date = dateutil.parser.parse(date)

     imgs = []
     for page, img in enumerate(imgs_):
         if not img['has_image']:
             print('no img')
             continue
         url = None
-        video = None
         embed = img.get('player_embedded')
         if embed:
             soup = Soup(embed)
             url_embed = soup.find('iframe').attrs['src']
             print_(f'embed: {url_embed}')
             try:
-                html = downloader.read_html(url_embed, session=session, referer=post_url)
-                soup = Soup(html)
+                soup = downloader.read_soup(url_embed, post_url, session=session)
                 v = soup.find('video')
                 if v:
                     url = v.find('source').attrs['src']
@@ -223,19 +226,20 @@ def get_imgs_page(id_art, session, date=None, cw=None, names=None):
                     url = soup.find('link', {'rel': 'canonical'}).attrs['href']
                     print_(f'YouTube: {url}')
                     raise Exception('YouTube')
-##                    from extractor import youtube_downloader
-##                    video = youtube_downloader.Video(url, cw=cw)
-##                    video.data = data
             except Exception as e:
                 print(e)
                 url = None
         if not url:
             url = img['image_url']

-        if video:
-            img = video
-        else:
-            img = Image(post_url, date, url, page, name, data)
+        d = {
+            'date': date,
+            'name': name,
+            'page': page,
+        }
+        filename = utils.format('artstation', d, get_ext(url))
+        img = File_artstation({'referer': post_url, 'url': url.replace('/large/', '/4k/'), 'name': filename})
+        img.data = data

         imgs.append(img)
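The 'name' value handed to File_artstation comes from the project-internal utils.format and the class's '[date] name_ppage' template. The removed Image class built the same shape by hand as f'[{date}] {name}_p{page}{ext}', so the result presumably looks like this sketch (the %y%m%d date rendering is an assumption):

from datetime import datetime

def artstation_name_sketch(d, ext):
    # Hypothetical expansion of the '[date] name_ppage' template, mirroring
    # the filename the removed Image class built by hand.
    date = d['date']
    if isinstance(date, datetime):
        date = date.strftime('%y%m%d')  # assumed rendering of the parsed date
    return f"[{date}] {d['name']}_p{d['page']}{ext}"

print(artstation_name_sketch({'date': datetime(2023, 10, 4), 'name': 'artist', 'page': 0}, '.jpg'))
# -> '[231004] artist_p0.jpg'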

