From 2eee77d6eb804940928d211ddfc80106c85064d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E9=98=B3?= <1070942929@qq.com>
Date: Thu, 7 Dec 2017 11:42:19 +0800
Subject: [PATCH 1/2] MacOS Python3.6.3 PyCharm
---
.gitignore | 3 +++
.idea/vcs.xml | 6 +++++
download.py | 63 ++++++++++++++++++++++++++++-----------------------
3 files changed, 44 insertions(+), 28 deletions(-)
create mode 100644 .gitignore
create mode 100644 .idea/vcs.xml
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b2f5cb7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/.idea
+/pdfs
+**/.DS_Store
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/download.py b/download.py
index 1442527..d0ec812 100644
--- a/download.py
+++ b/download.py
@@ -3,7 +3,7 @@
import re
from six.moves.urllib.request import urlopen
from six.moves.urllib.error import HTTPError
-import urllib2
+import urllib
import shutil
import argparse
import mistune
@@ -13,48 +13,55 @@
import requests
# encoding=utf8
-import sys
+import sys
+
+
+# reload(sys)
+# sys.setdefaultencoding('utf8')
-reload(sys)
-sys.setdefaultencoding('utf8')
def download_pdf(link, location, name):
try:
response = requests.get(link)
with open(os.path.join(location, name), 'wb') as f:
- f.write(response.content)
- f.close()
+ f.write(response.content)
+ f.close()
except HTTPError:
- print('>>> Error 404: cannot be downloaded!\n')
- raise
+ print('>>> Error 404: cannot be downloaded!\n')
+ raise
except socket.timeout:
- print(" ".join(("can't download", link, "due to connection timeout!")) )
+ print(" ".join(("can't download", link, "due to connection timeout!")))
raise
+
def clean_pdf_link(link):
if 'arxiv' in link:
- link = link.replace('abs', 'pdf')
- if not(link.endswith('.pdf')):
+ link = link.replace('abs', 'pdf')
+ if not (link.endswith('.pdf')):
link = '.'.join((link, 'pdf'))
print(link)
return link
-def clean_text(text, replacements = {':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
+
+def clean_text(text, replacements={':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
for key, rep in replacements.items():
text = text.replace(key, rep)
- return text
+ return text
+
+
+def print_title(title, pattern="-"):
+ print('\n'.join(("", title, pattern * len(title))))
-def print_title(title, pattern = "-"):
- print('\n'.join(("", title, pattern * len(title))))
def get_extension(link):
extension = os.path.splitext(link)[1][1:]
if extension in ['pdf', 'html']:
return extension
if 'pdf' in extension:
- return 'pdf'
- return 'pdf'
+ return 'pdf'
+ return 'pdf'
+
def shorten_title(title):
m1 = re.search('[[0-9]*]', title)
@@ -62,16 +69,16 @@ def shorten_title(title):
if m1:
title = m1.group(0)
if m2:
- title = ' '.join((title, m2.group(0)))
- return title[:50] + ' [...]'
+ title = ' '.join((title, m2.group(0)))
+ return title[:50] + ' [...]'
if __name__ == '__main__':
- parser = argparse.ArgumentParser(description = 'Download all the PDF/HTML links into README.md')
+ parser = argparse.ArgumentParser(description='Download all the PDF/HTML links into README.md')
parser.add_argument('-d', action="store", dest="directory")
- parser.add_argument('--no-html', action="store_true", dest="nohtml", default = False)
- parser.add_argument('--overwrite', action="store_true", default = False)
+ parser.add_argument('--no-html', action="store_true", dest="nohtml", default=False)
+ parser.add_argument('--overwrite', action="store_true", default=False)
results = parser.parse_args()
output_directory = 'pdfs' if results.directory is None else results.directory
@@ -81,8 +88,8 @@ def shorten_title(title):
if results.overwrite and os.path.exists(output_directory):
shutil.rmtree(output_directory)
- with open('README.md') as readme:
- readme_html = mistune.markdown(readme.read())
+ with open('README.md', 'rb') as readme:
+ readme_html = mistune.markdown(readme.read().decode('utf-8'))
readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")
point = readme_soup.find_all('h1')[1]
@@ -95,7 +102,7 @@ def shorten_title(title):
h1_directory = os.path.join(output_directory, clean_text(point.text))
current_directory = h1_directory
elif point.name == 'h2':
- current_directory = os.path.join(h1_directory, clean_text(point.text))
+ current_directory = os.path.join(h1_directory, clean_text(point.text))
if not os.path.exists(current_directory):
os.makedirs(current_directory)
print_title(point.text)
@@ -111,7 +118,7 @@ def shorten_title(title):
try:
name = clean_text(point.text.split('[' + ext + ']')[0])
fullname = '.'.join((name, ext))
- if not os.path.exists('/'.join((current_directory, fullname)) ):
+ if not os.path.exists('/'.join((current_directory, fullname))):
download_pdf(link, current_directory, '.'.join((name, ext)))
except KeyboardInterrupt:
try:
@@ -122,8 +129,8 @@ def shorten_title(title):
break
except:
failures.append(point.text)
-
- point = point.next_sibling
+
+ point = point.next_sibling
print('Done!')
if failures:
From 9ee93a4d09ba292c083e1f00611d316f02fec295 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E9=98=B3?= <1070942929@qq.com>
Date: Sun, 17 Dec 2017 23:40:31 +0800
Subject: [PATCH 2/2] ignore .idea
---
.idea/vcs.xml | 6 ------
1 file changed, 6 deletions(-)
delete mode 100644 .idea/vcs.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file