|
|
- import urllib
- from bs4 import BeautifulSoup
- import urlparse
- import mechanize
- import pickle
- import re
-
-
-
class Crawler:
    """Breadth-first site crawler that records visited URLs as a sitemap.

    Starting from ``url``, every link found by the mechanize browser is
    resolved to an absolute URL, filtered through :meth:`is_valid`, and
    queued.  When the queue is exhausted the visited URLs are written to
    ``outputfile`` (only the ``'xml'`` output format is implemented).
    """

    # Compiled form of ``allowed_regex``.  Built lazily so that
    # ``is_valid`` also works before ``crawl`` has been called.
    regex = None

    def __init__(self, url, outputfile='sitemap.xml', logfile='error.log', oformat='xml'):
        """Set up the crawl state.

        url        -- starting point; only URLs containing it are followed.
        outputfile -- path the sitemap is written to.
        logfile    -- path of the error log, opened in append mode.
        oformat    -- output format; only 'xml' triggers any output.
        """
        # Starting point for the spider and the mechanize browser
        # object used to fetch pages.
        self.url = url
        self.br = mechanize.Browser()
        self.logfile = open(logfile, 'a')
        self.oformat = oformat
        self.outputfile = outputfile

        # Queue of URLs still to fetch and the list of URLs already seen.
        self.urls = [url]
        self.visited = [url]
        # URLs listed here are not rejected for already being in ``visited``.
        self.excepted = []
        self.exts = ['htm', 'php']
        # A URL matching this pattern is rejected: it ends in "name.ext"
        # where ext does not start with one of the permitted prefixes.
        self.allowed_regex = r'(\w+)\.((?!htm)(?!rar)\w+)$'

    def set_exts(self, exts):
        """Replace the list of permitted extension prefixes.

        Takes effect on the rejection pattern only after calling
        ``allow_regex()`` without arguments.
        """
        self.exts = exts

    def allow_regex(self, regex=None):
        """Set the rejection pattern.

        With ``regex`` given, use it verbatim.  Otherwise rebuild the
        pattern from ``self.exts`` so that URLs ending in an extension
        that does not start with one of those prefixes are rejected.
        """
        if regex is not None:
            self.allowed_regex = regex
        else:
            # One negative lookahead "(?!ext)" per permitted prefix;
            # escaping keeps regex metacharacters in an extension literal.
            lookaheads = ''.join(
                '(?!{})'.format(re.escape(ext)) for ext in self.exts
            )
            self.allowed_regex = r'(\w+)\.({}\w+)$'.format(lookaheads)
        # Drop any stale compiled pattern; it is rebuilt on next use.
        self.regex = None

    def crawl(self):
        """Fetch every queued URL, queue its valid links, then write output.

        A failure while fetching or parsing a page is written to the
        error log and the crawl continues with the next queued URL.
        """
        self.regex = re.compile(self.allowed_regex)
        while self.urls:
            current = self.urls[0]
            try:
                self.br.open(current)
                for link in self.br.links():
                    newurl = urlparse.urljoin(link.base_url, link.url)
                    if self.is_valid(newurl):
                        self.visited.append(newurl)
                        self.urls.append(newurl)
            except Exception as exc:
                # Best effort: one bad page must not abort the crawl.
                # str(exc) is used because ``exc.message`` is deprecated
                # and frequently empty; the URL makes the entry traceable.
                self.errlog('{}: {}'.format(current, exc))

            self.urls.pop(0)

        if self.oformat == 'xml':
            self.write_xml()

    def is_valid(self, url):
        """Return True if ``url`` should be queued for crawling.

        A URL is rejected when it was already visited (unless it is
        listed in ``self.excepted``), when it does not contain the
        start URL, or when it matches the rejection pattern.
        """
        if url in self.visited and url not in self.excepted:
            return False
        if self.url not in url:
            return False
        if self.regex is None:
            self.regex = re.compile(self.allowed_regex)
        if self.regex.search(url):
            return False
        return True

    def errlog(self, msg):
        """Append ``msg`` followed by a newline to the error log."""
        self.logfile.write(msg)
        self.logfile.write('\n')
        # Flush so entries survive if the process dies mid-crawl.
        self.logfile.flush()

    def close(self):
        """Close the error log opened by ``__init__``."""
        self.logfile.close()

    def write_xml(self):
        """Write all visited URLs to ``self.outputfile`` as a sitemap."""
        from xml.sax.saxutils import escape

        # The sitemap protocol requires these five characters to be
        # entity-escaped; escape() handles &, < and > by default.
        quote_entities = {"'": '&apos;', '"': '&quot;'}
        url_str = '<url><loc>{}</loc></url>\n'
        # ``with`` guarantees the file is closed even if a write fails.
        with open(self.outputfile, 'w') as of:
            of.write('<?xml version="1.0" encoding="utf-8"?><!--Generated by Screaming Frog SEO Spider 2,55-->\n')
            of.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n')
            for url in self.visited:
                of.write(url_str.format(escape(url, quote_entities)))
            of.write('</urlset>')
-
-
|