Skip to content

Commit

Permalink
Merge pull request #143 from koddas/master
Browse files Browse the repository at this point in the history
Added lxml_html_clean and updated pdfplumber dependencies
  • Loading branch information
GjjvdBurg authored Aug 8, 2024
2 parents 780eee6 + a5a81bf commit c645528
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ reMarkable from any of the following sources:
* [ACL Web](https://www.aclweb.org/anthology/)
* [ACM Digital Library](https://dl.acm.org/dl.cfm)
* [CVF](https://openaccess.thecvf.com/menu)
* [DiVA](https://diva-portal.org/)
* [ECCC](https://eccc.weizmann.ac.il/reports/menu/)
* [IACR](https://eprint.iacr.org/)
* [JMLR](http://jmlr.org)
Expand Down
15 changes: 15 additions & 0 deletions paper2remarkable/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,18 @@ def __str__(self):
"as paper2remarkable gets blocked by CloudFlare.\n"
)
return msg

class FulltextMissingError(Error):
    """Signal that no fulltext PDF could be located for a record.

    Attributes are stored as given so that ``str()`` can report which
    provider failed and for which URL.
    """

    def __init__(self, provider, url):
        # Name of the provider that failed and the URL it was asked about.
        self.url = url
        self.provider = provider

    def __str__(self):
        # Header line followed by the provider and URL, one per line,
        # each indented with a tab.
        parts = [
            "ERROR: Couldn't find the fulltext PDF for the following url:\n",
            f"\t{self.provider}\n",
            f"\t{self.url}\n",
        ]
        return "".join(parts)
2 changes: 2 additions & 0 deletions paper2remarkable/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .arxiv import Arxiv
from .citeseerx import CiteSeerX # disabled, incomplete html doc received
from .cvf import CVF
from .diva import DiVA
from .eccc import ECCC
from .html import HTML
from .iacr import IACR
Expand Down Expand Up @@ -33,6 +34,7 @@
ACM,
Arxiv,
CVF,
DiVA,
ECCC,
IACR,
JMLR,
Expand Down
76 changes: 76 additions & 0 deletions paper2remarkable/providers/diva.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-

"""Provider for DiVA - Digitala Vetenskapliga Arkivet
Author: G.J.J. van den Burg, Johan Holmberg
License: See LICENSE file
Copyright: 2019, 2024, G.J.J. van den Burg, Johan Holmberg
"""

import os
import re
import urllib.parse

import bs4

from ..exceptions import URLResolutionError, FulltextMissingError
from ..log import Logger
from ..utils import get_page_with_retry
from ._base import Provider
from ._info import Informer

logger = Logger()


class DiVAInformer(Informer):
    """Informer that reads publication metadata from a DiVA record page."""

    def get_year(self, soup):
        """Return the content of the ``citation_publication_date`` meta tag.

        Parameters
        ----------
        soup : bs4.BeautifulSoup
            Parsed record page.

        Returns
        -------
        str
            The tag's ``content`` value, or an empty string (after logging a
            warning) when the tag is absent or has no content.
        """
        # ``find`` returns None when the tag is missing; guard against that
        # so we fall through to the warning instead of an AttributeError.
        tag = soup.find("meta", {"name": "citation_publication_date"})
        year = tag.get("content") if tag is not None else None
        if not year:
            logger.warning(
                "Couldn't determine year information, maybe provide the desired filename using '--filename'?"
            )
            return ""
        return year


class DiVA(Provider):
    """Provider for publications hosted on ``*.diva-portal.org``.

    Accepts either a record page URL or a direct fulltext PDF URL and
    derives the other one from it.
    """

    # Raw strings with escaped dots: an unescaped "." would match any
    # character and accept look-alike hosts.
    # Record page: <sub>.diva-portal.org/smash/record.jsf?pid=diva2%3A<id>
    re_abs = r"^https?://[a-z]+\.diva-portal\.org/smash/record\.jsf"
    # Fulltext: <sub>.diva-portal.org/smash/get/diva2:<id>/FULLTEXT...
    re_pdf = (
        r"^https?://[a-z]+\.diva-portal\.org/smash/get/diva2:[0-9]+/FULLTEXT"
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.informer = DiVAInformer()

    def _get_doc_url(self, abs_url):
        """Return the fulltext PDF URL advertised on the record page.

        The page at ``abs_url`` is fetched and its ``citation_pdf_url``
        meta tag is read.

        Raises
        ------
        FulltextMissingError
            If the tag is absent or carries no ``content`` value.
        """
        page = get_page_with_retry(abs_url)
        soup = bs4.BeautifulSoup(page, "html.parser")

        tag = soup.find("meta", {"name": "citation_pdf_url"})
        # Treat a tag without content like a missing tag, so callers never
        # receive None as the PDF URL.
        pdf_url = tag.get("content") if tag is not None else None
        if not pdf_url:
            logger.warning("Couldn't find the fulltext URL")
            raise FulltextMissingError("DiVA", abs_url)

        return pdf_url

    def _get_abs_url(self, pdf_url):
        """Build the record page URL from a fulltext PDF URL.

        Raises
        ------
        URLResolutionError
            If ``pdf_url`` does not contain a DiVA base URL and record id.
        """
        match = re.match(
            r"(https?://[a-z]+\.diva-portal\.org/smash/)get/diva2:([0-9]+)",
            pdf_url,
        )
        if match is None:
            raise URLResolutionError("DiVA", pdf_url)
        base_url, diva_id = match.groups()
        # The record page expects the colon in "diva2:<id>" percent-encoded.
        return f"{base_url}record.jsf?pid=diva2%3A{diva_id}"

    def get_abs_pdf_urls(self, url):
        """Return ``(abs_url, pdf_url)`` for a record or fulltext URL.

        Raises
        ------
        URLResolutionError
            If ``url`` matches neither the record nor the fulltext pattern.
        FulltextMissingError
            If a record page does not advertise a fulltext PDF.
        """
        if re.match(self.re_abs, url):
            abs_url = url
            pdf_url = self._get_doc_url(url)
        elif re.match(self.re_pdf, url):
            abs_url = self._get_abs_url(url)
            pdf_url = url
        else:
            raise URLResolutionError("DiVA", url)
        return abs_url, pdf_url

    @staticmethod
    def validate(src):
        """Return a truthy match if ``src`` is a DiVA record or PDF URL."""
        return re.match(DiVA.re_abs, src) or re.match(DiVA.re_pdf, src)
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@
REQUIRED = [
"beautifulsoup4>=4.8",
"html2text>=2020.1.16",
"lxml_html_clean>=0.1.1",
"markdown>=3.1.1",
"pdfplumber>=0.5",
"pdfplumber>=0.11",
"pikepdf>=2.9.0",
"pycryptodome",
"pyyaml>=5.1",
Expand Down
24 changes: 23 additions & 1 deletion tests/test_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@
from _constants import TEST_FILE
from pikepdf import Pdf

from paper2remarkable.exceptions import URLResolutionError
from paper2remarkable.exceptions import URLResolutionError, FulltextMissingError
from paper2remarkable.providers import ACL
from paper2remarkable.providers import ACM
from paper2remarkable.providers import CVF
from paper2remarkable.providers import DiVA
from paper2remarkable.providers import ECCC
from paper2remarkable.providers import HTML
from paper2remarkable.providers import IACR
Expand Down Expand Up @@ -552,6 +553,27 @@ def test_iacr_3(self):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))

def test_diva_1(self):
    """A urn.kb.se resolver link should yield the expected PDF name."""
    # Testing redirections from Kungliga biblioteket
    provider = DiVA(upload=False, verbose=VERBOSE)
    expected = "Lidayova_-_Fast_Methods_for_Vascular_Segmentation_Based_on_Approximate_Skeleton_Detection_2017.pdf"
    result = provider.run(
        "https://urn.kb.se/resolve?urn=urn:nbn:se:uu:diva-318796"
    )
    self.assertEqual(expected, os.path.basename(result))

def test_diva_2(self):
    """A direct record URL should yield a sanitized PDF filename."""
    # Testing absolute URLs and sanitization of filenames
    provider = DiVA(upload=False, verbose=VERBOSE)
    expected = "Alhussein_-_Privacy_by_Design_Amp_Internet_of_Things_Managing_Privacy_2018.pdf"
    result = provider.run(
        "https://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1480467"
    )
    self.assertEqual(expected, os.path.basename(result))

def test_diva_3(self):
    """A record without fulltext should raise FulltextMissingError."""
    # Testing older entries without available fulltext
    provider = DiVA(upload=False, verbose=VERBOSE)
    record_url = "https://uu.diva-portal.org/smash/record.jsf?pid=diva2%3A59234"
    with self.assertRaises(FulltextMissingError):
        provider.run(record_url)

# Run the unittest runner when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 comments on commit c645528

Please sign in to comment.