-
-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[naverpost] add 'post' and 'user' extractors
- Loading branch information
1 parent
807ddde
commit 82af247
Showing
5 changed files
with
204 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -100,6 +100,7 @@ | |
"myhentaigallery", | ||
"myportfolio", | ||
"naver", | ||
"naverpost", | ||
"naverwebtoon", | ||
"newgrounds", | ||
"nhentai", | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License version 2 as | ||
# published by the Free Software Foundation. | ||
|
||
"""Extractors for https://post.naver.com/""" | ||
|
||
from .common import Extractor, Message | ||
from .. import text, exception | ||
import json | ||
import re | ||
|
||
# Host part shared by every extractor pattern in this module.
# Both the URL scheme and the "m." subdomain are optional.
BASE_PATTERN = (
    r"(?:https?://)?"
    r"(?:m\.)?"
    r"post\.naver\.com"
)
|
||
|
||
class NaverpostExtractor(Extractor):
    """Base class for naver post extractors"""
    category = "naverpost"
    root = "https://post.naver.com"
    request_interval = (0.5, 1.5)

    def _call(self, url, params=None):
        """Send a request to 'url' and return the response object.

        HTTP status 401, 403, and 404 are translated into
        AuthenticationError, AuthorizationError, and NotFoundError.
        Any other HttpError is logged at debug level and re-raised,
        so callers never receive None in place of a response.
        """
        if params is None:
            params = {}
        try:
            return self.request(url, params=params)
        except exception.HttpError as exc:
            if exc.status == 401:
                raise exception.AuthenticationError()
            if exc.status == 403:
                raise exception.AuthorizationError()
            if exc.status == 404:
                raise exception.NotFoundError(self.subcategory)
            self.log.debug(exc)
            # Callers access '.text' on the result right away;
            # propagate the real error instead of returning None.
            raise

    def _pagination(self, url, params=None):
        """Yield absolute post URLs from a paginated JSON endpoint.

        Each response is a JSON object whose "html" value holds the
        rendered post list. Pagination continues while the response
        contains a "nextFromNo" key, which is sent back as the
        "fromNo" query parameter of the following request.
        """
        # Work on a copy so the caller's dict is not modified
        params = {} if params is None else dict(params)

        while True:
            body = self._call(url, params).text
            # the `html` string in the response contains escaped single
            # quotes, which would throw a JSONDecodeError exception
            res = json.loads(body.replace(r"\'", "'"))

            for endpoint in text.extract_iter(
                    res["html"], '<div class="text_area">\n<a href="', '"'):
                yield self.root + endpoint

            if "nextFromNo" not in res:
                return
            params["fromNo"] = res["nextFromNo"]
|
||
|
||
class NaverpostPostExtractor(NaverpostExtractor):
    """Extractor for posts on post.naver.com"""
    subcategory = "post"
    filename_fmt = "{image[id]}.{extension}"
    directory_fmt = ("{category}", "{author}", "{volume_no}")
    archive_fmt = "{image[id]}"
    pattern = (BASE_PATTERN + r"/viewer/postView\.(naver|nhn)"
               r"\?volumeNo=(\d+)(?:&.+)?")
    example = "https://post.naver.com/viewer/postView.naver?volumeNo=12345"

    def __init__(self, match):
        NaverpostExtractor.__init__(self, match)
        url = match.group(0)
        # BASE_PATTERN accepts URLs without a scheme;
        # make sure the URL that gets requested always has one.
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        self.url = url
        self.page_ext = match.group(1)
        self.volume_no = match.group(2)

    def metadata(self, page):
        """Collect post metadata from the HTML source in 'page'"""
        data = {
            "title": text.unescape(
                text.extr(page, '"og:title" content="', '"')),
            "description": text.unescape(
                text.extr(page, '"og:description" content="', '"')),
            "author": text.extr(page, '"og:author" content="', '"'),
            "date": text.parse_datetime(
                text.extr(page, '"og:createdate" content="', '"'),
                format="%Y.%m.%d. %H:%M:%S", utcoffset=9),
            "volume_no": self.volume_no,
            # the view counter appears under one of two class names
            "views": text.parse_int(
                (text.extr(page, '<span class="post_view">', ' ') or
                 text.extr(page, '<span class="se_view" style="">', ' ')
                 ).replace(",", "")),
            "url": self.url,
        }
        return data

    def items(self):
        """Yield one Directory message followed by a Url message per image"""
        page = self._call(self.url).text
        data = self.metadata(page)

        yield Message.Directory, data

        image_classes = ("img_attachedfile", "se_mediaImage")
        # trailing size selector of image URLs; compiled once for the loop
        strip_query = re.compile(r"\?type=w\d+$").subn

        for image in text.extract_iter(page, "<img", ">"):
            # only <img> tags carrying one of the known classes are images
            classes = text.extr(image, ' class="', '"').split()
            if not any(item in classes for item in image_classes):
                continue

            # skip images whose URL has no size selector to strip
            url, stripped = strip_query("", text.extr(image, ' data-src="', '"'))
            if not stripped:
                continue

            img = {
                "id": text.extr(image, ' id="', '"'),
                "title": text.extr(image, ' title="', '"'),
                "attachment-id": text.extr(
                    image, ' data-attachment-id="', '"'),
                "alt": None,
                "url": url,
            }

            alt = text.extr(image, ' alt="', '"')
            if alt and alt.endswith(".jpg"):
                # 'alt' holds a file name; take name and extension from it
                img["alt"] = alt
                data["filename"], _, data["extension"] = alt.rpartition(".")
            else:
                text.nameext_from_url(text.unquote(url), data)

            data["image"] = img
            yield Message.Url, url, data
|
||
|
||
class NaverpostUserExtractor(NaverpostExtractor):
    """Extractor for all posts from a user on post.naver.com"""
    subcategory = "user"
    # the dot in "my.naver" is escaped so it only matches a literal "."
    pattern = (BASE_PATTERN + r"/my\.naver\?memberNo=(\d+)")
    example = "https://post.naver.com/my.naver?memberNo=12345"

    def __init__(self, match):
        NaverpostExtractor.__init__(self, match)
        self.member_no = match.group(1)

    def items(self):
        """Yield a Queue message for every post URL of the user"""
        data = {"_extractor": NaverpostPostExtractor}
        url = self.root + "/async/my.naver"
        params = {"memberNo": self.member_no}
        for post_url in self._pagination(url, params):
            yield Message.Queue, post_url, data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License version 2 as | ||
# published by the Free Software Foundation. | ||
|
||
from gallery_dl.extractor import naverpost | ||
|
||
# Case-insensitive pattern that every extracted image URL is checked against
# (via "#pattern" in the post test cases below).
IMAGE_URL_PATTERN = r"(?i)https://post-phinf\.pstatic\.net/.*\.(?:jpe?g|png|gif|webp)"


# Test cases for the naverpost extractors.
__tests__ = (
    # post URL on the "m." host using the ".nhn" page extension
    {
        "#url": "https://m.post.naver.com/viewer/postView.nhn?volumeNo=15861102&memberNo=16220685",
        "#comment": ".nhn page extension",
        "#category": ("", "naverpost", "post"),
        "#class": naverpost.NaverpostPostExtractor,
        "#pattern": IMAGE_URL_PATTERN,
        "#count": 34,

        "title": "[쇼! 음악중심] 180526 방탄소년단 FAKE LOVE 현장 포토",
        "description": "[BY MBC예능연구소] [쇼! 음악중심] 589회, 20180526 ※본 콘텐츠는 상업적 용도의 사용을 금합니다.",
        "author": "MBC예능연구소",
        "date": "dt:2018-05-29 12:09:34",
        "views": int,
    },

    # post URL using the ".naver" page extension
    {
        "#url": "https://post.naver.com/viewer/postView.naver?volumeNo=31389956&memberNo=29156514",
        "#comment": ".naver page extension",
        "#category": ("", "naverpost", "post"),
        "#class": naverpost.NaverpostPostExtractor,
        "#pattern": IMAGE_URL_PATTERN,
        "#count": 48,

        "title": "매일 밤 꿈꿔 왔던 드림캐쳐 '바람아' 활동 비하인드 현장",
        "description": "[BY 드림캐쳐컴퍼니] 안녕하세요.드림캐쳐 포스트 지기입니다!(*・▽・*)'Odd Eye' 활동이 끝나고 아쉬웠을...",
        "author": "드림캐쳐컴퍼니",
        "date": "dt:2021-05-03 06:00:09",
        "views": int,
    },

    # user listing; the range of 21 is one more than the 20 posts
    # returned per request
    {
        "#url": "https://post.naver.com/my.naver?memberNo=29156514",
        "#comment": "up to 20 posts are returned per request",
        "#category": ("", "naverpost", "user"),
        "#class": naverpost.NaverpostUserExtractor,
        "#pattern": naverpost.NaverpostPostExtractor.pattern,
        "#range": "1-21",
        "#count": 21,
    },

)