Skip to content

Commit

Permalink
[naverpost] add 'post' and 'user' extractors
Browse files Browse the repository at this point in the history
  • Loading branch information
bradenhilton committed Nov 12, 2023
1 parent 4288cea commit 116027c
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 0 deletions.
6 changes: 6 additions & 0 deletions docs/supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Blogs, Posts</td>
<td></td>
</tr>
<tr>
<td>NaverPost</td>
<td>https://post.naver.com/</td>
<td>Posts, User Profiles</td>
<td></td>
</tr>
<tr>
<td>NaverWebtoon</td>
<td>https://comic.naver.com/</td>
Expand Down
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
"myhentaigallery",
"myportfolio",
"naver",
"naverpost",
"naverwebtoon",
"newgrounds",
"nhentai",
Expand Down
143 changes: 143 additions & 0 deletions gallery_dl/extractor/naverpost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://post.naver.com/"""

from .common import Extractor, Message
from .. import text, exception
import json
import re

BASE_PATTERN = r"(?:https?://)?(?:m\.)?post\.naver\.com"


class NaverpostExtractor(Extractor):
"""Base class for naver post extractors"""
category = "naverpost"
root = "https://post.naver.com"
request_interval = (0.5, 1.5)

def _call(self, url, params=None):
if params is None:
params = {}
while True:
try:
return self.request(url, params=params)
except exception.HttpError as exc:
if exc.status == 401:
raise exception.AuthenticationError()
if exc.status == 403:
raise exception.AuthorizationError()
if exc.status == 404:
raise exception.NotFoundError(self.subcategory)
self.log.debug(exc)
return

def _pagination(self, url, params=None):
if params is None:
params = {}
while True:
res = self._call(url, params).text
# the `html` string in the response contains escaped single quotes,
# which would throw a JSONDecodeError exception
res = json.loads(res.replace(r"\'", "'"))
urls = []
endpoints = text.extract_iter(
res["html"], '<div class="text_area">\n<a href="', '"')
for endpoint in endpoints:
urls.append(self.root + endpoint)
yield from urls
if "nextFromNo" not in res:
return
params["fromNo"] = res["nextFromNo"]


class NaverpostPostExtractor(NaverpostExtractor):
"""Extractor for posts on post.naver.com"""
subcategory = "post"
filename_fmt = "{image[id]}.{extension}"
directory_fmt = ("{category}", "{author}", "{volume_no}")
archive_fmt = "{image[id]}"
pattern = (BASE_PATTERN + r"/viewer/postView\.(naver|nhn)"
r"\?volumeNo=(\d+)(?:&.+)?")
example = "https://post.naver.com/viewer/postView.naver?volumeNo=12345"

def __init__(self, match):
NaverpostExtractor.__init__(self, match)
self.url = match.group(0)
self.page_ext = match.group(1)
self.volume_no = match.group(2)

def metadata(self, page):
data = {
"title": text.unescape(
text.extr(page, '"og:title" content="', '"')),
"description": text.unescape(
text.extr(page, '"og:description" content="', '"')),
"author": text.extr(page, '"og:author" content="', '"'),
"date": text.parse_datetime(
text.extr(page, '"og:createdate" content="', '"'),
format="%Y.%m.%d. %H:%M:%S", utcoffset=9),
"volume_no": self.volume_no,
"views": text.parse_int(
(text.extr(page, '<span class="post_view">', ' ') or
text.extr(page, '<span class="se_view" style="">', ' ')
).replace(",", "")),
"url": self.url,
}
return data

def items(self):
page = self._call(self.url).text
data = self.metadata(page)

yield Message.Directory, data

image_classes = ("img_attachedfile", "se_mediaImage")
image_query = r"\?type=w\d+$"
for image in text.extract_iter(page, "<img", ">"):
img = {
"id": text.extr(image, ' id="', '"'),
"title": text.extr(image, ' title="', '"'),
"attachment-id": text.extr(
image, ' data-attachment-id="', '"'),
"alt": None,
}
classes = text.extr(image, ' class="', '"').split()
if not any(item in classes for item in image_classes):
continue
url = text.extr(image, ' data-src="', '"')
if not re.search(image_query, url):
continue
url = re.sub(image_query, "", url)
img["url"] = url
alt = text.extr(image, ' alt="', '"')
if alt and alt.endswith(".jpg"):
img["alt"] = alt
data["filename"], _, data["extension"] = alt.rpartition(".")
else:
text.nameext_from_url(text.unquote(url), data)
data["image"] = img
yield Message.Url, url, data


class NaverpostUserExtractor(NaverpostExtractor):
"""Extractor for all posts from a user on post.naver.com"""
subcategory = "user"
pattern = BASE_PATTERN + r"/my\.naver\?memberNo=(\d+)"
example = "https://post.naver.com/my.naver?memberNo=12345"

def __init__(self, match):
NaverpostExtractor.__init__(self, match)
self.member_no = match.group(1)

def items(self):
data = {"_extractor": NaverpostPostExtractor}
endpoint = self.root + "/async/my.naver"
params = {"memberNo": self.member_no}
posts = self._pagination(endpoint, params)
for url in posts:
yield Message.Queue, url, data
1 change: 1 addition & 0 deletions scripts/supportedsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
"mastodon.social": "mastodon.social",
"myhentaigallery": "My Hentai Gallery",
"myportfolio" : "Adobe Portfolio",
"naverpost" : "NaverPost",
"naverwebtoon" : "NaverWebtoon",
"nhentai" : "nhentai",
"nijie" : "nijie",
Expand Down
53 changes: 53 additions & 0 deletions test/results/naverpost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

from gallery_dl.extractor import naverpost

IMAGE_URL_PATTERN = r"(?i)https://post-phinf\.pstatic\.net/.*\.(?:jpe?g|png|gif|webp)"


__tests__ = (
{
"#url": "https://m.post.naver.com/viewer/postView.nhn?volumeNo=15861102&memberNo=16220685",
"#comment": ".nhn page extension",
"#category": ("", "naverpost", "post"),
"#class": naverpost.NaverpostPostExtractor,
"#pattern": IMAGE_URL_PATTERN,
"#count": 34,

"title": "[쇼! 음악중심] 180526 방탄소년단 FAKE LOVE 현장 포토",
"description": "[BY MBC예능연구소] [쇼! 음악중심] 589회, 20180526 ※본 콘텐츠는 상업적 용도의 사용을 금합니다.",
"author": "MBC예능연구소",
"date": "dt:2018-05-29 12:09:34",
"views": int,
},

{
"#url": "https://post.naver.com/viewer/postView.naver?volumeNo=31389956&memberNo=29156514",
"#comment": ".naver page extension",
"#category": ("", "naverpost", "post"),
"#class": naverpost.NaverpostPostExtractor,
"#pattern": IMAGE_URL_PATTERN,
"#count": 48,

"title": "매일 밤 꿈꿔 왔던 드림캐쳐 '바람아' 활동 비하인드 현장",
"description": "[BY 드림캐쳐컴퍼니] 안녕하세요.드림캐쳐 포스트 지기입니다!(*・▽・*)'Odd Eye' 활동이 끝나고 아쉬웠을...",
"author": "드림캐쳐컴퍼니",
"date": "dt:2021-05-03 06:00:09",
"views": int,
},

{
"#url": "https://post.naver.com/my.naver?memberNo=29156514",
"#comment": "up to 20 posts are returned per request",
"#category": ("", "naverpost", "user"),
"#class": naverpost.NaverpostUserExtractor,
"#pattern": naverpost.NaverpostPostExtractor.pattern,
"#range": "1-21",
"#count": 21,
},

)

0 comments on commit 116027c

Please sign in to comment.