-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest_code.py
59 lines (56 loc) · 2.48 KB
/
test_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import logging
import os
import re
from time import time

##from datamodel.search.datamodel import ProducedLink, OneUnProcessedGroup, robot_manager
##from spacetime_local.IApplication import IApplication
##from spacetime_local.declarations import Producer, GetterSetter, Getter
from lxml import html,etree,cssselect
from bs4 import BeautifulSoup

try:
    # For python 2
    from urlparse import urlparse, parse_qs ,urljoin
except ImportError:
    # For python 3
    from urllib.parse import urlparse, parse_qs, urljoin
# Link-extraction scratch script: parse a document with BeautifulSoup, walk
# every <a> tag and collect an absolute URL for each usable href in
# ``outputLinks``. A short label is printed per link so the classification
# (absolute / skipped / root-relative / other) can be inspected by eye.
outputLinks = list()
outputLinks1 = list()  # never filled by this script; kept for compatibility

# Matches a path that ends in a known page/script/document extension.
# Compared with the earlier pattern: "jhtml" and "jsp" are separate
# alternatives (they were fused into "jhtmljsp"), and the trailing empty
# alternative -- which made any path ending in "." match -- has been removed.
_FILE_EXTENSION_RE = re.compile(
    r".*\.(asp|aspx|axd|asx|asmx|ashx|css|cfm|yaws|swf|html|htm|xhtml"
    r"|jhtml|jsp|jspx|wss|do|action|js|pl|php|php4|php3|phtml|py|rb|rhtml"
    r"|shtml|xml|rss|svg|cgi|dll|whl|exe|pdf)$"
)

# hrefs that are already absolute and are collected unchanged.
_ABSOLUTE_PREFIXES = ("http://", "https://")
# hrefs that do not lead to another page (fragments, mail, script) and are
# reported but not collected.
_SKIPPED_PREFIXES = ("#", "mailto", "javascript")

raw_content = "http://www.ics.uci.edu/~fowlkes/bioshape/batvis/Viewer/index.html"
# NOTE(review): raw_content holds a URL rather than HTML markup, so the parse
# below presumably yields no <a> tags. It looks like a stand-in for a crawled
# page's content *and* its URL -- confirm, and feed real markup to
# BeautifulSoup when exercising the loop.
page_url = raw_content  # base against which relative hrefs are resolved
soup = BeautifulSoup(raw_content,"lxml")
links = soup.find_all('a')
for tag in links:
    link = tag.get('href')
    if link is None:
        # <a> without an href attribute: nothing to follow.
        continue
    if link.startswith(_ABSOLUTE_PREFIXES):
        outputLinks.append(link)
        print("http")
        print(link)
    elif link.startswith(_SKIPPED_PREFIXES):
        print("# JS" + link)
    elif link.startswith("/"):
        # Root-relative href. Resolve it against the page URL; parsing the
        # href on its own gives an empty scheme and host, which previously
        # produced results such as ":///dir/dir/page.html".
        points_to_file = _FILE_EXTENSION_RE.match(urlparse(link).path.lower())
        link = urljoin(page_url, link)
        # The extension check only selects the debug label; both kinds of
        # link are resolved the same way and collected.
        if points_to_file:
            print("html "+link)
        else:
            print("Append_join" + link)
        outputLinks.append(link)
    else:
        # Anything else (e.g. "page.html", "../x") is resolved relative to
        # the page URL as well.
        final_url = urljoin(page_url, link)
        print("Else" + link)
        outputLinks.append(final_url)