Skip to content

Commit

Permalink
feat: add longblack.co as a highlight source (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
bskim45 authored Sep 17, 2024
1 parent 1ee1bed commit d73caa3
Show file tree
Hide file tree
Showing 16 changed files with 646 additions and 142 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ coverage.xml
.hypothesis/
.pytest_cache/
cover/
junit.xml
pytest-coverage.txt


# pyenv
.python-version
Expand Down
2 changes: 2 additions & 0 deletions .pylintrc.toml
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,8 @@ disable = [
"use-implicit-booleaness-not-comparison-to-string",
"use-implicit-booleaness-not-comparison-to-zero",
"missing-docstring",
"duplicate-code",
"logging-fstring-interpolation",
]

# Enable the message, report, category or checker with the given id(s). You can
Expand Down
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@ test:
--cov-report term-missing:skip-covered \
--cov-report html \
--cov-report xml \
--junitxml=junit.xml \
-vvv \
tests
--pyargs ridiwise \
--cov src \
| tee pytest-coverage.txt

clean:
rm -rf htmlcov pytest-coverage.txt
rm -rf .coverage htmlcov coverage.xml pytest-coverage.txt junit.xml

### Docker
DOCKER_REPO ?= bskim45/ridiwise
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# ridiwise

Sync Ridibooks book notes to Readwise.io
Sync book/article highlights to Readwise.io

## Installation

Expand Down Expand Up @@ -49,7 +49,7 @@ pipx install git+https://github.com/bskim45/ridiwise.git
$ ridiwise --help
Usage: ridiwise [OPTIONS] COMMAND [ARGS]...

ridiwise: Sync Ridibooks book notes to Readwise.io
ridiwise: Sync book/article highlights to Readwise.io

(...)
```
Expand Down
16 changes: 12 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ authors = [
{ name = "Bumsoo Kim", email = "[email protected]" }
]
dependencies = [
"typer>=0.12.3",
"httpx>=0.27.0",
"browser-cookie3>=0.19.1",
"playwright>=1.45.1",
"typer>=0.12.3",
"httpx>=0.27.0",
"browser-cookie3>=0.19.1",
"playwright>=1.45.1",
]
readme = "README.md"
license = { file = "LICENSE" }
Expand All @@ -31,6 +31,7 @@ dev-dependencies = [
"ruff>=0.5.4",
"pylint>=3.2.6",
"bump-my-version>=0.24.3",
"pytest-cov>=5.0.0",
]


Expand Down Expand Up @@ -70,3 +71,10 @@ allow_dirty = true

[[tool.bumpversion.files]]
filename = 'src/ridiwise/__init__.py'


[tool.coverage.run]
parallel = true
omit = [
"*/tests/*",
]
4 changes: 4 additions & 0 deletions requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ click==8.1.7
# via bump-my-version
# via rich-click
# via typer
coverage==7.6.1
# via pytest-cov
dill==0.3.8
# via pylint
distlib==0.3.8
Expand Down Expand Up @@ -90,6 +92,8 @@ pygments==2.18.0
# via rich
pylint==3.2.6
pytest==8.3.1
# via pytest-cov
pytest-cov==5.0.0
python-dotenv==1.0.1
# via pydantic-settings
pyyaml==6.0.1
Expand Down
212 changes: 212 additions & 0 deletions src/ridiwise/api/longblack.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import datetime
import re
import urllib.parse
from typing import Optional, TypedDict
from zoneinfo import ZoneInfo

from playwright.sync_api import (
Locator,
)
from playwright.sync_api import (
TimeoutError as PlaywrightTimeoutError,
)

from ridiwise.api.browser_base_client import BrowserBaseClient

# Host name of the Longblack site; interpolated into `LongblackClient.base_url`
# and into the login log message.
DOMAIN = 'www.longblack.co'
# NOTE(review): despite the name this holds a full origin URL (scheme + host),
# not a bare cookie domain. It is not referenced in the visible part of this
# module -- confirm where (or whether) it is consumed.
COOKIE_DOMAIN = f'https://{DOMAIN}'

# CSS selectors for the controls of the login form, used by
# `LongblackClient.login` to fill in credentials and submit.
SELECTOR_LOGIN_USER_ID = 'form.login-form input[name="email"]'
SELECTOR_LOGIN_PASSWORD = 'form.login-form input[name="password"]'
SELECTOR_LOGIN_BUTTON = 'form.login-form button[type="submit"]'


class Note(TypedDict):
    """
    A Longblack article ("note") that a highlight belongs to.

    Instances are built by `LongblackClient._parse_dom` as the `note` entry
    of each `Scrap`.
    """

    # Numeric article id (kept as a string), captured from the `/note/<id>`
    # segment of the scrap URL.
    note_id: str
    # Canonical article URL, built as `<base_url>/note/<note_id>`.
    note_url: str
    # Text of the `span` inside the scrap's `a.note-info` link.
    title: str
    # Derived from `title` (the part before the first ':'); None when the
    # title has no usable "author : title" shape.
    author: Optional[str]
    # `src` attribute of the `img` inside the `a.note-info` link.
    cover_image_url: Optional[str]


class Scrap(TypedDict):
    """
    A single highlight ("scrap") a user saved from a Longblack article.

    Instances are built by `LongblackClient._parse_dom` from one item of the
    scrap list page.
    """

    # Alphanumeric id captured from the `#memoId=<id>` fragment of the scrap URL.
    scrap_id: str
    # `href` of the scrap's `a.note-info` link, stored as found in the page.
    scrap_url: str
    # Stripped inner text of the item's `.scrap-content` element.
    highlighted_text: str
    # Value of the memo modal's textbox; None when the scrap shows no memo
    # indicator or the modal never becomes visible.
    memo: Optional[str]
    # Timezone-aware timestamp (Asia/Seoul) parsed from the item's `.date` text.
    created_datetime: datetime.datetime
    # The article this highlight was taken from.
    note: Note


class LongblackClient(BrowserBaseClient):
    """
    Browser-driven client that collects a user's highlights ("scraps") from
    longblack.co.

    The credentials are kept on the instance; `get_scraps` logs in on demand
    and then walks the paginated scrap list.

    NOTE(review): `browser_context`, `logger` and `cache_dir` are used below
    but not defined in this class -- presumably they are provided by
    `BrowserBaseClient`; confirm against that class.
    """

    base_url = f'https://{DOMAIN}'
    provider = 'longblack'
    storage_state_filename = f'browser_state_{provider}.json'

    # Matches `/note/<digits>` followed (anywhere later) by `#memoId=<alnum>`.
    # Compiled once at class creation instead of on every call.
    _SCRAP_URL_PATTERN = re.compile(r'/note/(\d+).*#memoId=([A-Za-z0-9]+)')

    def __init__(
        self,
        user_id: str,
        password: str,
        *args,
        **kwargs,
    ):
        """
        Store the login credentials and forward everything else to the base
        class initializer.

        Args:
            user_id: Value typed into the login form's email field.
            password: Value typed into the login form's password field.
            *args: Positional arguments passed through to `BrowserBaseClient`.
            **kwargs: Keyword arguments passed through to `BrowserBaseClient`.
        """
        self.user_id = user_id
        self.password = password

        super().__init__(*args, **kwargs)

    @staticmethod
    def parse_scrap_url(url: Optional[str]) -> Optional[tuple[str, str]]:
        """
        Extract the note id and the scrap (memo) id from a scrap URL.

        Args:
            url: A URL or path such as `/note/123?x=1#memoId=AbC9`. May be
                None or empty, e.g. when the `href` attribute is missing.

        Returns:
            `(note_id, scrap_id)` as strings, or None when `url` is
            None/empty or does not contain both parts.
        """
        if not url:
            # A missing href must not blow up inside the regex engine.
            return None

        match = LongblackClient._SCRAP_URL_PATTERN.search(url)
        if match is None:
            return None

        note_id, scrap_id = match.groups()
        return note_id, scrap_id

    @staticmethod
    def parse_scrap_date(datetime_string: str) -> datetime.datetime:
        """
        Parse a timestamp in the format 'YYYY.MM.DD HH:MM'.

        Surrounding whitespace is ignored. The result is tagged with the
        'Asia/Seoul' timezone.

        Args:
            datetime_string: Timestamp text, e.g. '2024.09.17 08:30'.

        Returns:
            A timezone-aware `datetime.datetime`.

        Raises:
            ValueError: If the text does not match the expected format.
        """
        datetime_format = '%Y.%m.%d %H:%M'
        parsed = datetime.datetime.strptime(datetime_string.strip(), datetime_format)
        return parsed.replace(tzinfo=ZoneInfo('Asia/Seoul'))

    @staticmethod
    def get_author_from_scrap_title(title: str) -> Optional[str]:
        """
        Extract the author from a title shaped like '<author> : <title>'.

        Args:
            title: The note title as shown on the scrap list.

        Returns:
            The stripped text before the first ':', or None when there is no
            ':' or when either side of it is blank.
        """
        author, separator, remainder = title.partition(':')
        if not separator:
            return None

        author = author.strip()
        # Both halves must carry text; otherwise the ':' is not an
        # author/title separator.
        if not author or not remainder.strip():
            return None

        return author

    def login(self):
        """
        Log in through the web login form and persist the browser state.

        On success the browser context's storage state is written to
        `cache_dir / storage_state_filename`.

        Raises:
            PlaywrightTimeoutError: If the post-login redirect to
                `/membership` is not observed in time.
        """
        self.logger.info(f'Login: `{DOMAIN}`')

        with self.browser_context.new_page() as page:
            page.goto(f'{self.base_url}/login?return_url=/membership')
            page.wait_for_selector(SELECTOR_LOGIN_USER_ID)

            page.locator(SELECTOR_LOGIN_USER_ID).fill(self.user_id)
            page.locator(SELECTOR_LOGIN_PASSWORD).fill(self.password)

            page.click(SELECTOR_LOGIN_BUTTON)

            try:
                # Reaching the return URL is the success signal.
                page.wait_for_url('**/membership')
                self.cache_dir.mkdir(parents=True, exist_ok=True)
                self.browser_context.storage_state(
                    path=self.cache_dir / self.storage_state_filename
                )
            except PlaywrightTimeoutError:
                self.logger.error('Login timeout')
                # Bare raise keeps the original traceback intact.
                raise

    def is_authenticated(self) -> bool:
        """
        Report whether the current browser context is logged in.

        Requests `/membership` without following redirects and returns the
        response's `ok` flag.
        NOTE(review): this presumably relies on the site redirecting
        anonymous visitors away from `/membership` -- confirm.
        """
        with self.browser_context.new_page() as page:
            res = page.request.get(f'{self.base_url}/membership', max_redirects=0)
            return res.ok

    def get_scraps(self, max_pages: int = 20) -> list[Scrap]:
        """
        Collect the user's scraps, newest first, logging in first if needed.

        Args:
            max_pages: Upper bound on the number of list pages fetched. The
                default of 20 keeps the original cap, chosen to avoid
                spamming the server.

        Returns:
            The parsed scraps from every fetched page, in page order.
            Fetching stops early at the first page without any scrap item.
        """
        if not self.is_authenticated():
            self.logger.info('Login required')
            self.login()

        scraps: list[Scrap] = []

        for page_num in range(1, max_pages + 1):
            query_params = urllib.parse.urlencode(
                {
                    'page': page_num,
                    'view': 'note',
                    'sort': 'latest',
                    'search': '',
                }
            )

            with self.browser_context.new_page() as page:
                page.goto(f'{self.base_url}/scrap?{query_params}')
                items = page.locator('.swiper-slide:has(div.scrap)').all()

                if not items:
                    break

                # Parse while the page is still open: the locators are only
                # usable as long as their page exists.
                scraps.extend(self._parse_dom(item) for item in items)

        return scraps

    def _parse_dom(self, elem: Locator) -> Scrap:
        """
        Build a `Scrap` from one item of the scrap list page.

        Args:
            elem: Locator of a single scrap item.

        Returns:
            The parsed scrap, including its parent note.

        Raises:
            ValueError: If the item's link does not look like a scrap URL, or
                if its date text is not in the expected format.
        """
        highlighted_text = elem.locator('.scrap-content').inner_text().strip()
        # `text_content()` may return None; treat that like empty text so the
        # failure surfaces as a ValueError from the date parser.
        date_str = (elem.locator('.date').text_content() or '').strip()
        scrap_date = self.parse_scrap_date(date_str)

        note_info = elem.locator('a.note-info')

        scrap_url = note_info.get_attribute('href')
        ids = self.parse_scrap_url(scrap_url)
        if ids is None:
            # Fail with a readable message instead of an unpacking TypeError.
            raise ValueError(f'Unrecognized scrap URL: {scrap_url!r}')
        note_id, scrap_id = ids

        note_title = (note_info.locator('span').text_content() or '').strip()
        note_cover_image_url = note_info.locator('img').get_attribute('src')

        memo = self._get_memo(elem)

        return {
            'scrap_id': scrap_id,
            'scrap_url': scrap_url,
            'highlighted_text': highlighted_text,
            'memo': memo,
            'created_datetime': scrap_date,
            'note': {
                'title': note_title,
                'note_url': f'{self.base_url}/note/{note_id}',
                'note_id': note_id,
                'author': self.get_author_from_scrap_title(note_title),
                'cover_image_url': note_cover_image_url,
            },
        }

    @staticmethod
    def _get_memo(elem: Locator) -> Optional[str]:
        """
        Read the memo attached to a scrap item, if it has one.

        The memo text is only available inside a modal, so the item's
        show-memo button is clicked, the textbox value is read, and the
        modal's negative action button is clicked afterwards (presumably
        this dismisses the modal -- confirm).

        Args:
            elem: Locator of a single scrap item.

        Returns:
            The memo text, or None when the item shows no memo indicator or
            no modal becomes visible in time.
        """
        memo_button = elem.locator('.actions').locator('button.show-memo')
        indicator = memo_button.locator('.memo-icon.dot')

        # The dot indicator marks scraps that carry a memo.
        if not indicator.is_visible():
            return None

        memo_button.click()
        memo_modals = elem.page.locator('.memo-modal')
        memo_modal = memo_modals.locator('visible=true')

        if not memo_modal.is_visible():
            # Fall back to waiting on the last modal in the page.
            try:
                memo_modal = memo_modals.last
                memo_modal.wait_for(state='visible')
            except PlaywrightTimeoutError:
                return None

        memo = memo_modal.get_by_role('textbox').input_value()
        memo_modal.locator('.actions').locator('button.negative').click()

        return memo
45 changes: 4 additions & 41 deletions src/ridiwise/cmd/common_option.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,26 @@
import enum
from typing import Optional
from collections import defaultdict

import typer

from ridiwise.cmd.context import AuthState, ContextState


@enum.unique
class RidiAuthMethod(enum.StrEnum):
# BROWSER_COOKIE = 'browser_cookie'
HEADLESS_BROWSER = 'headless_browser'
from ridiwise.cmd.context import ContextState


def check_common_options(
ctx: typer.Context,
auth_method: RidiAuthMethod,
user_id: Optional[str],
password: Optional[str],
headless_mode: bool,
browser_timeout_seconds: int,
):
context: ContextState = ctx.ensure_object(dict)

auth_state: AuthState = {
'auth_method': auth_method,
}
if 'auths' not in context:
context['auths'] = defaultdict()

context['auth'] = auth_state
context['headless_mode'] = headless_mode
context['browser_timeout_seconds'] = browser_timeout_seconds

if auth_method == RidiAuthMethod.HEADLESS_BROWSER:
if not all([user_id, password]):
raise typer.BadParameter('`user_id` and `password` must be provided.')

auth_state['user_id'] = user_id
auth_state['password'] = password


def common_params(
ctx: typer.Context,
auth_method: RidiAuthMethod = typer.Option(
default=RidiAuthMethod.HEADLESS_BROWSER,
envvar='RIDI_AUTH_METHOD',
help='Authentication method to use with Ridibooks.',
),
user_id: Optional[str] = typer.Option(
default=None,
envvar='RIDI_USER_ID',
help='Ridibooks user ID.',
),
password: Optional[str] = typer.Option(
default=None,
envvar='RIDI_PASSWORD',
help='Ridibooks password.',
),
headless_mode: bool = typer.Option(
True,
envvar='HEADLESS_MODE',
Expand All @@ -69,9 +35,6 @@ def common_params(
ctx.ensure_object(dict)
check_common_options(
ctx=ctx,
auth_method=auth_method,
user_id=user_id,
password=password,
headless_mode=headless_mode,
browser_timeout_seconds=browser_timeout_seconds,
)
Loading

0 comments on commit d73caa3

Please sign in to comment.