From 2eee77d6eb804940928d211ddfc80106c85064d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E9=98=B3?= <1070942929@qq.com>
Date: Thu, 7 Dec 2017 11:42:19 +0800
Subject: [PATCH 1/2] MacOS Python3.6.3 PyCharm

---
 .gitignore    |  3 +++
 .idea/vcs.xml |  6 +++++
 download.py   | 63 ++++++++++++++++++++++++++++-----------------------
 3 files changed, 44 insertions(+), 28 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 .idea/vcs.xml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b2f5cb7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/.idea
+/pdfs
+**/.DS_Store
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/download.py b/download.py
index 1442527..d0ec812 100644
--- a/download.py
+++ b/download.py
@@ -3,7 +3,7 @@ import re
 from six.moves.urllib.request import urlopen
 from six.moves.urllib.error import HTTPError
-import urllib2
+import urllib
 import shutil
 import argparse
 import mistune
@@ -13,48 +13,55 @@ import requests
 
 # encoding=utf8
-import sys
+import sys
+
+
+# reload(sys)
+# sys.setdefaultencoding('utf8')
 
-reload(sys)
-sys.setdefaultencoding('utf8')
 
 def download_pdf(link, location, name):
     try:
         response = requests.get(link)
         with open(os.path.join(location, name), 'wb') as f:
-            f.write(response.content)
-            f.close()
+            f.write(response.content)
+            f.close()
     except HTTPError:
-        print('>>> Error 404: cannot be downloaded!\n')
-        raise
+        print('>>> Error 404: cannot be downloaded!\n')
+        raise
     except socket.timeout:
-        print(" ".join(("can't download", link, "due to connection timeout!")) )
+        print(" ".join(("can't download", link, "due to connection timeout!")))
         raise
 
+
 def clean_pdf_link(link):
     if 'arxiv' in link:
-        link = link.replace('abs', 'pdf')
-        if not(link.endswith('.pdf')):
+        link = link.replace('abs', 'pdf')
+        if not (link.endswith('.pdf')):
             link = '.'.join((link, 'pdf'))
     print(link)
     return link
 
-def clean_text(text, replacements = {':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
+
+def clean_text(text, replacements={':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
     for key, rep in replacements.items():
         text = text.replace(key, rep)
-    return text
+    return text
+
+
+def print_title(title, pattern="-"):
+    print('\n'.join(("", title, pattern * len(title))))
 
-def print_title(title, pattern = "-"):
-    print('\n'.join(("", title, pattern * len(title))))
 
 def get_extension(link):
     extension = os.path.splitext(link)[1][1:]
     if extension in ['pdf', 'html']:
         return extension
     if 'pdf' in extension:
-        return 'pdf'
-    return 'pdf'
+        return 'pdf'
+    return 'pdf'
+
 
 def shorten_title(title):
     m1 = re.search('[[0-9]*]', title)
@@ -62,16 +69,16 @@ def shorten_title(title):
     if m1:
         title = m1.group(0)
     if m2:
-        title = ' '.join((title, m2.group(0)))
-    return title[:50] + ' [...]'
+        title = ' '.join((title, m2.group(0)))
+    return title[:50] + ' [...]'
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description = 'Download all the PDF/HTML links into README.md')
+    parser = argparse.ArgumentParser(description='Download all the PDF/HTML links into README.md')
     parser.add_argument('-d', action="store", dest="directory")
-    parser.add_argument('--no-html', action="store_true", dest="nohtml", default = False)
-    parser.add_argument('--overwrite', action="store_true", default = False)
+    parser.add_argument('--no-html', action="store_true", dest="nohtml", default=False)
+    parser.add_argument('--overwrite', action="store_true", default=False)
     results = parser.parse_args()
 
     output_directory = 'pdfs' if results.directory is None else results.directory
@@ -81,8 +88,8 @@ def shorten_title(title):
     if results.overwrite and os.path.exists(output_directory):
         shutil.rmtree(output_directory)
 
-    with open('README.md') as readme:
-        readme_html = mistune.markdown(readme.read())
+    with open('README.md', 'rb') as readme:
+        readme_html = mistune.markdown(readme.read().decode('utf-8'))
 
     readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")
     point = readme_soup.find_all('h1')[1]
@@ -95,7 +102,7 @@ def shorten_title(title):
             h1_directory = os.path.join(output_directory, clean_text(point.text))
             current_directory = h1_directory
         elif point.name == 'h2':
-            current_directory = os.path.join(h1_directory, clean_text(point.text))
+            current_directory = os.path.join(h1_directory, clean_text(point.text))
         if not os.path.exists(current_directory):
             os.makedirs(current_directory)
         print_title(point.text)
@@ -111,7 +118,7 @@ def shorten_title(title):
                 try:
                     name = clean_text(point.text.split('[' + ext + ']')[0])
                     fullname = '.'.join((name, ext))
-                    if not os.path.exists('/'.join((current_directory, fullname)) ):
+                    if not os.path.exists('/'.join((current_directory, fullname))):
                         download_pdf(link, current_directory, '.'.join((name, ext)))
                 except KeyboardInterrupt:
                     try:
@@ -122,8 +129,8 @@ def shorten_title(title):
                     break
                 except:
                     failures.append(point.text)
-
-        point = point.next_sibling
+
+        point = point.next_sibling
 
     print('Done!')
    if failures:

From 9ee93a4d09ba292c083e1f00611d316f02fec295 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E9=98=B3?= <1070942929@qq.com>
Date: Sun, 17 Dec 2017 23:40:31 +0800
Subject: [PATCH 2/2] ignore ./idea

---
 .idea/vcs.xml | 6 ------
 1 file changed, 6 deletions(-)
 delete mode 100644 .idea/vcs.xml

diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="VcsDirectoryMappings">
-    <mapping directory="$PROJECT_DIR$" vcs="Git" />
-  </component>
-</project>
\ No newline at end of file
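
The substance of PATCH 1/2 is the Python 2 to Python 3 port of download.py: the reload(sys) / sys.setdefaultencoding('utf8') hack is commented out, urllib2 is swapped for urllib, and README.md is opened in binary mode and decoded explicitly rather than relying on a process-wide default encoding. A minimal standalone sketch of that README-parsing pattern follows; it assumes mistune 0.x and beautifulsoup4 are installed and a UTF-8 README.md sits in the working directory, and the link walk at the end is illustrative, not part of the patch.

    # Python 3 pattern adopted by the patch: read README.md as bytes and
    # decode UTF-8 explicitly -- no reload(sys)/sys.setdefaultencoding() hack.
    import mistune
    import bs4 as BeautifulSoup  # same import alias download.py uses

    with open('README.md', 'rb') as readme:
        readme_html = mistune.markdown(readme.read().decode('utf-8'))

    # Parse the rendered HTML and list the links the downloader would visit.
    readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")
    for anchor in readme_soup.find_all('a'):
        print(anchor.get('href'))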