-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathloggen_create_urls.py
More file actions
86 lines (74 loc) · 2.58 KB
/
loggen_create_urls.py
File metadata and controls
86 lines (74 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sys
import getopt
import argparse
import lxml
def main(argv):
    """Parse command-line arguments and dispatch to the matching extractor.

    Parameters
    ----------
    argv : list[str]
        Command-line arguments (without the program name), i.e.
        [url, outputfile, cmstype].

    cmstype must be one of "basic", "sitemapped" or "wordpress";
    argparse exits with usage help (SystemExit) on anything else.
    """
    parser = argparse.ArgumentParser(
        description="Helper function to create an input file for the url crawler")
    parser.add_argument("url", help="Base URL of the site to crawl")
    parser.add_argument("outputfile", help="File the discovered URLs are appended to")
    # choices= rejects bad values up front; the original fell through to
    # extract_links() and died with a NameError on an undefined linkxml.
    parser.add_argument("cmstype", choices=["basic", "sitemapped", "wordpress"],
                        help="Site type: basic, sitemapped or wordpress")
    # Parse the argv actually passed in; the original called parse_args()
    # with no arguments, silently ignoring this function's parameter.
    args = parser.parse_args(argv)
    url = args.url
    outputfile = args.outputfile
    cmstype = args.cmstype

    print('Url to crawl is', url)
    print('Output file is', outputfile)
    print('cmstype is', cmstype)

    if cmstype != "basic":
        extract_links(url, outputfile, cmstype)
    else:
        extract_basic_links(url, outputfile)
# Module-level accumulator declared `global` inside both extractors below,
# but never actually appended to in the visible code — TODO confirm it is
# still needed before removing.
links = []
#df = pd.DataFrame({"links":links})
def extract_links(url, outputfile, cmstype):
    """Walk a site's XML sitemap and append every page URL to *outputfile*.

    Parameters
    ----------
    url : str
        Base URL of the site (no trailing slash).
    outputfile : str
        Path of the output file; URLs are appended, one per line.
    cmstype : str
        "wordpress" (sitemap at /wp-sitemap.xml) or
        "sitemapped" (sitemap at /sitemap.xml).

    Raises
    ------
    ValueError
        If cmstype is neither "wordpress" nor "sitemapped".
        (The original code hit a NameError on an undefined linkxml here.)
    """
    print(url, cmstype)
    sitemap_paths = {
        "wordpress": "/wp-sitemap.xml",
        "sitemapped": "/sitemap.xml",
    }
    try:
        linkxml = url + sitemap_paths[cmstype]
    except KeyError:
        raise ValueError("Unsupported cmstype: %r" % (cmstype,)) from None
    print(linkxml)
    source_url = requests.get(linkxml)
    with open(outputfile, 'a+') as output_file:
        soup = BeautifulSoup(source_url.text, 'xml')
        for item in soup.find_all('loc'):
            try:
                if '.xml' in item.text:
                    # Nested sitemap index entry: fetch the sub-sitemap and
                    # collect the page URLs it lists.
                    r = requests.get(item.text)
                    new_soup = BeautifulSoup(r.text, 'xml')
                    for new_item in new_soup.find_all('loc'):
                        output_file.write(new_item.text + "\n")
                        print(new_item.text)
                else:
                    # Plain page URL: record it directly.
                    output_file.write(item.text + "\n")
                    print(item.text)
            except TypeError:
                # Best-effort crawl: skip malformed <loc> entries.
                pass
def extract_basic_links(url, outputfile):
    """Fetch *url* and append the href of each <a> tag on it to *outputfile*.

    Parameters
    ----------
    url : str
        Page to fetch and scan for anchor tags.
    outputfile : str
        Path of the output file; hrefs are appended, one per line.
    """
    source_url = requests.get(url)
    with open(outputfile, 'a+') as output_file:
        soup = BeautifulSoup(source_url.content, 'html.parser')
        for a_tag in soup.find_all('a', href=True):
            href = a_tag.get('href')
            # Bug fix: the original tested `str(url) not in a_tags`, i.e.
            # a string against the *list of tags*, which is always True.
            # Compare against each individual href instead.
            # NOTE(review): as written this skips hrefs containing the base
            # URL (keeps relative/external links) — confirm the intended
            # filter direction with the crawler's requirements.
            if str(url) not in href:
                output_file.write(href + "\n")
                print(href)
    # No explicit close(): the `with` block already closed the file
    # (the original's output_file.close() was redundant).
if __name__ == "__main__":
    # Script entry point: forward the CLI arguments (minus the program name).
    main(sys.argv[1:])