Skip to content

Commit

Permalink
Partial OpenReview scraping
Browse files Browse the repository at this point in the history
Framework for OpenReview scraping.
Works with some ID.
Does not work with https://openreview.net/forum?id=0g0X4H8yN4I
Does not test if the paper exists in Wikidata.
  • Loading branch information
fnielsen committed Dec 4, 2024
1 parent e265d46 commit 5356b4c
Showing 1 changed file with 134 additions and 0 deletions.
134 changes: 134 additions & 0 deletions scholia/scrape/openreview.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""
Module for scraping metadata from OpenReview.net submissions.
This module can be used as a script or imported as a module to extract metadata
from OpenReview.net submissions. It downloads the submission page and extracts
metadata such as title, authors, date of publication, OpenReview submission ID,
PDF link, and license (if available) and output it in the QuickStatement format.
Usage:
scholia.scrape.openreview paper-url-to-quickstatements <url>
As a module:
html = paper_url_to_html('aVh9KRZdRk')
data = html_to_paper(html)
"""

import datetime
import json

from lxml import etree

import requests

from ..config import config
from ..qs import paper_to_quickstatements


USER_AGENT = config['requests'].get('user_agent')


def paper_url_to_html(identifier):
"""Download the HTML content from an OpenReview.net submission page.
Parameters
----------
identifier : str
The URL or the submission ID of the OpenReview.net submission.
Returns
-------
str
The HTML content of the page.
Examples
--------
>>> html = paper_url_to_html('https://openreview.net/forum?id=aVh9KRZdRk')
"""
if identifier.startswith('http'):
url = identifier
else:
url = f'https://openreview.net/forum?id={identifier}'
headers = {'User-Agent': USER_AGENT}
response = requests.get(url, headers=headers)
response.raise_for_status()
return response.text


def html_to_paper(html):
"""Extract metadata from the OpenReview.net submission page HTML.
Parameters
----------
html : str
The HTML content of the OpenReview.net submission page.
Returns
-------
dict
A dictionary containing metadata about the submission.
Examples
--------
>>> html = paper_url_to_html('https://openreview.net/forum?id=aVh9KRZdRk')
>>> paper = html_to_paper(html)
>>> paper['title'].startswith('Learning to grok')
True
"""
tree = etree.HTML(html)
data = {}

# Find the script tag with id="__NEXT_DATA__"
script_elements = tree.xpath('//script[@id="__NEXT_DATA__"]')
if script_elements:
json_text = script_elements[0].text
json_data = json.loads(json_text)
# Navigate through the JSON to get to the content
content = json_data.get('props', {}).get('pageProps', {}).get('forumNote', {}).get('content', {})

if 'title' in content and 'value' in content['title']:
data['title'] = content['title']['value']
if 'authors' in content and 'value' in content['authors']:
data['authors'] = content['authors']['value']
if 'abstract' in content and 'value' in content['abstract']:
data['abstract'] = content['abstract']['value']

forum_note = json_data.get('props', {}).get('pageProps', {}).get('forumNote', {})
if 'id' in forum_note:
data['openreview_id'] = forum_note['id']
data['url'] = 'https://openreview.net/forum?id=' + forum_note['id']
data['full_text_url'] = 'https://openreview.net/pdf?id=' + forum_note['id']
if 'pdate' in forum_note:
pdate = forum_note['pdate']
# pdate is in milliseconds since epoch
dt = datetime.datetime.utcfromtimestamp(pdate / 1000)
data['date'] = dt.date().isoformat()
if 'licence' in forum_note:
data['license'] = forum_note['license']

return data


def main():
"""Handle command-line interface."""
from docopt import docopt

arguments = docopt(__doc__)

if arguments['paper-url-to-quickstatements']:
url = arguments['<url>']
html = paper_url_to_html(url)
paper = html_to_paper(html)

# Output the data in QuickStatement format or as needed
qs = paper_to_quickstatements(paper)
print(qs)

else:
assert False

if __name__ == '__main__':
main()

0 comments on commit 5356b4c

Please sign in to comment.