Skip to content

Commit

Permalink
Further fix for #2567
Browse files Browse the repository at this point in the history
Style check
Quickstatement fix
  • Loading branch information
fnielsen committed Dec 4, 2024
1 parent 5356b4c commit be8bdf7
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 11 deletions.
6 changes: 5 additions & 1 deletion scholia/qs.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def paper_to_quickstatements(paper):
Notes
-----
title, authors (list), date, doi, year, language_q, volume, issue, pages,
number_of_pages, url, full_text_url, published_in_q are recognized.
number_of_pages, url, full_text_url, published_in_q, openreview_id are
recognized.
`date` takes precedence over `year`.
Expand Down Expand Up @@ -226,6 +227,9 @@ def paper_to_quickstatements(paper):
if 'published_in_q' in paper and paper['published_in_q']:
qs += 'LAST\tP1433\t{}\n'.format(paper['published_in_q'])

if 'openreview_id' in paper and paper['openreview_id']:
qs += 'LAST\tP8968\t"{}"\n'.format(paper['openreview_id'])

return qs


Expand Down
26 changes: 16 additions & 10 deletions scholia/scrape/openreview.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
This module can be used as a script or imported as a module to extract metadata
from OpenReview.net submissions. It downloads the submission page and extracts
metadata such as title, authors, date of publication, OpenReview submission ID,
PDF link, and license (if available) and output it in the QuickStatement format.
PDF link, and license (if available), and outputs them in the QuickStatements
format.
Usage:
scholia.scrape.openreview paper-url-to-quickstatements <url>
Expand All @@ -22,7 +23,7 @@

import requests

from ..config import config
from ..config import config
from ..qs import paper_to_quickstatements


Expand Down Expand Up @@ -87,28 +88,32 @@ def html_to_paper(html):
json_text = script_elements[0].text
json_data = json.loads(json_text)
# Navigate through the JSON to get to the content
content = json_data.get('props', {}).get('pageProps', {}).get('forumNote', {}).get('content', {})
content = json_data.get(
'props', {}).get(
'pageProps', {}).get('forumNote', {}).get('content', {})

if 'title' in content and 'value' in content['title']:
data['title'] = content['title']['value']
if 'authors' in content and 'value' in content['authors']:
data['authors'] = content['authors']['value']
if 'abstract' in content and 'value' in content['abstract']:
data['abstract'] = content['abstract']['value']

forum_note = json_data.get('props', {}).get('pageProps', {}).get('forumNote', {})

forum_note = json_data.get('props', {}).get('pageProps',
{}).get('forumNote', {})
if 'id' in forum_note:
data['openreview_id'] = forum_note['id']
data['url'] = 'https://openreview.net/forum?id=' + forum_note['id']
data['full_text_url'] = 'https://openreview.net/pdf?id=' + forum_note['id']
data['full_text_url'] = 'https://openreview.net/pdf?id=' + \
forum_note['id']
if 'pdate' in forum_note:
pdate = forum_note['pdate']
# pdate is in milliseconds since epoch
dt = datetime.datetime.utcfromtimestamp(pdate / 1000)
data['date'] = dt.date().isoformat()
if 'licence' in forum_note:
data['license'] = forum_note['license']

return data


Expand All @@ -117,18 +122,19 @@ def main():
from docopt import docopt

arguments = docopt(__doc__)

if arguments['paper-url-to-quickstatements']:
url = arguments['<url>']
html = paper_url_to_html(url)
paper = html_to_paper(html)

# Output the data in QuickStatement format or as needed
qs = paper_to_quickstatements(paper)
print(qs)

else:
assert False


if __name__ == '__main__':
main()

0 comments on commit be8bdf7

Please sign in to comment.