-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathloggen_create_urls.py
More file actions
86 lines (74 loc) · 2.58 KB
/
loggen_create_urls.py
File metadata and controls
86 lines (74 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sys
import getopt
import argparse
import lxml
def main(argv):
    """Parse command-line arguments and dispatch to the matching extractor.

    Parameters
    ----------
    argv : list[str]
        Command-line arguments (without the program name), i.e.
        [url, outputfile, cmstype].

    cmstype must be one of "basic", "sitemapped" or "wordpress";
    argparse exits with usage help (SystemExit) on anything else.
    """
    parser = argparse.ArgumentParser(
        description="Helper function to create an input file for the url crawler")
    parser.add_argument("url", help="Base URL of the site to crawl")
    parser.add_argument("outputfile", help="File the discovered URLs are appended to")
    # choices= rejects bad values up front; the original fell through to
    # extract_links() and died with a NameError on an undefined linkxml.
    parser.add_argument("cmstype", choices=["basic", "sitemapped", "wordpress"],
                        help="Site type: basic, sitemapped or wordpress")
    # Parse the argv actually passed in; the original called parse_args()
    # with no arguments, silently ignoring this function's parameter.
    args = parser.parse_args(argv)
    url = args.url
    outputfile = args.outputfile
    cmstype = args.cmstype

    print('Url to crawl is', url)
    print('Output file is', outputfile)
    print('cmstype is', cmstype)

    if cmstype != "basic":
        extract_links(url, outputfile, cmstype)
    else:
        extract_basic_links(url, outputfile)
# Module-level accumulator declared `global` inside both extractors below,
# but never actually appended to in the visible code — TODO confirm it is
# still needed before removing.
links = []
#df = pd.DataFrame({"links":links})
def extract_links(url, outputfile, cmstype):
    """Walk a site's XML sitemap and append every page URL to *outputfile*.

    Parameters
    ----------
    url : str
        Base URL of the site (no trailing slash).
    outputfile : str
        Path of the output file; URLs are appended, one per line.
    cmstype : str
        "wordpress" (sitemap at /wp-sitemap.xml) or
        "sitemapped" (sitemap at /sitemap.xml).

    Raises
    ------
    ValueError
        If cmstype is neither "wordpress" nor "sitemapped".
        (The original code hit a NameError on an undefined linkxml here.)
    """
    print(url, cmstype)
    sitemap_paths = {
        "wordpress": "/wp-sitemap.xml",
        "sitemapped": "/sitemap.xml",
    }
    try:
        linkxml = url + sitemap_paths[cmstype]
    except KeyError:
        raise ValueError("Unsupported cmstype: %r" % (cmstype,)) from None
    print(linkxml)
    source_url = requests.get(linkxml)
    with open(outputfile, 'a+') as output_file:
        soup = BeautifulSoup(source_url.text, 'xml')
        for item in soup.find_all('loc'):
            try:
                if '.xml' in item.text:
                    # Nested sitemap index entry: fetch the sub-sitemap and
                    # collect the page URLs it lists.
                    r = requests.get(item.text)
                    new_soup = BeautifulSoup(r.text, 'xml')
                    for new_item in new_soup.find_all('loc'):
                        output_file.write(new_item.text + "\n")
                        print(new_item.text)
                else:
                    # Plain page URL: record it directly.
                    output_file.write(item.text + "\n")
                    print(item.text)
            except TypeError:
                # Best-effort crawl: skip malformed <loc> entries.
                pass
def extract_basic_links(url, outputfile):
    """Fetch *url* and append the href of each <a> tag on it to *outputfile*.

    Parameters
    ----------
    url : str
        Page to fetch and scan for anchor tags.
    outputfile : str
        Path of the output file; hrefs are appended, one per line.
    """
    source_url = requests.get(url)
    with open(outputfile, 'a+') as output_file:
        soup = BeautifulSoup(source_url.content, 'html.parser')
        for a_tag in soup.find_all('a', href=True):
            href = a_tag.get('href')
            # Bug fix: the original tested `str(url) not in a_tags`, i.e.
            # a string against the *list of tags*, which is always True.
            # Compare against each individual href instead.
            # NOTE(review): as written this skips hrefs containing the base
            # URL (keeps relative/external links) — confirm the intended
            # filter direction with the crawler's requirements.
            if str(url) not in href:
                output_file.write(href + "\n")
                print(href)
    # No explicit close(): the `with` block already closed the file
    # (the original's output_file.close() was redundant).
if __name__ == "__main__":
    # Script entry point: forward the CLI arguments (minus the program name).
    main(sys.argv[1:])