@@ -4,15 +4,19 @@ import urlparse
 import mechanize
 import pickle
 import re
 
+try:
+    import gevent
+    from gevent import monkey, pool
+    monkey.patch_all()
+    gevent_installed = True
+except ImportError:
+    print("Gevent is not installed. The parsing process will be slower.")
+    gevent_installed = False
 
 
 class Crawler:
     def __init__(self, url, outputfile='sitemap.xml', logfile='error.log', oformat='xml'):
-        # Set the startingpoint for the spider and initialize
-        # the a mechanize browser object
         self.url = url
-        self.br = mechanize.Browser()
         self.logfile = open(logfile, 'a')
         self.oformat = oformat
         self.outputfile = outputfile
@@ -35,14 +39,47 @@ class Crawler:
             allowed_regex += '(!{})'.format(ext)
         self.allowed_regex = '\.({}\w+)$'.format(allowed_regex)
 
-    def crawl(self):
+    def crawl(self, echo=False, pool_size=1):
+        self.echo = echo
         self.regex = re.compile(self.allowed_regex)
-        while len(self.urls)>0:
-            self.parse()
+        if gevent_installed and pool_size > 1:
+            self.pool = pool.Pool(pool_size)
+            self.pool.spawn(self.parse_gevent)
+            self.pool.join()
+        else:
+            while len(self.urls) > 0:
+                self.parse()
+        if self.oformat == 'xml':
+            self.write_xml()
+
+    def parse_gevent(self):
+        self.parse()
+        while len(self.urls) > 0 and not self.pool.full():
+            self.pool.spawn(self.parse_gevent)
 
     def parse(self):
+        if self.echo:
+            if not gevent_installed:
+                print('{} pages parsed :: {} pages in the queue'.format(len(self.visited), len(self.urls)))
+            else:
+                print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), len(self.urls)))
+
+        # Set the starting point for the spider and initialize
+        # a mechanize browser object
+        if not self.urls:
+            return
+        else:
+            url = self.urls.pop()
+            br = mechanize.Browser()
         try:
-            url = self.urls.pop()
-            self.br.open(url)
-            for link in self.br.links():
-                newurl = urlparse.urljoin(link.base_url,link.url)
+            response = br.open(url)
+            if response.code >= 400:
+                self.errlog("Error {} at url {}".format(response.code, url))
+                return
+
+            for link in br.links():
+                newurl = urlparse.urljoin(link.base_url, link.url)
                 #print newurl
                 if self.is_valid(newurl):
                     self.visited.update([newurl])
@@ -50,8 +87,9 @@ class Crawler:
         except Exception, e:
             self.errlog(e.message)
 
-        if self.oformat == 'xml':
-            self.write_xml()
+        br.close()
+        del(br)
 
 
     def is_valid(self, url):
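For anyone reading this diff who has not used gevent pools, the following standalone sketch (not part of the patch; `queue`, `seen`, `parse_one` and `worker` are illustrative names) shows the spawn-and-refill pattern that the new `crawl()` / `parse_gevent()` pair relies on: a greenlet handles one URL, then keeps topping the pool back up while work remains and the pool has free slots.

```python
# Minimal sketch of the pool pattern used by parse_gevent(); illustrative only.
import gevent
from gevent import monkey, pool

monkey.patch_all()  # make socket/HTTP calls cooperative, as the patch does

queue = set(['http://example.com/a', 'http://example.com/b', 'http://example.com/c'])
seen = set()


def parse_one():
    # Stand-in for "pop a URL, fetch it, extract links".
    if not queue:
        return
    url = queue.pop()
    seen.add(url)
    gevent.sleep(0.1)  # simulate network I/O


def worker(p):
    parse_one()
    # Re-spawn workers while there is work left and room in the pool,
    # which is what parse_gevent() does above.
    while queue and not p.full():
        p.spawn(worker, p)


p = pool.Pool(3)   # plays the role of pool_size in crawl()
p.spawn(worker, p)
p.join()
print('{} pages parsed'.format(len(seen)))
```

Concurrency stays bounded by the pool size without a central dispatcher, because each finished greenlet refills the pool itself; `crawl()` only has to spawn the first worker and `join()`.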
@@ -80,3 +118,5 @@ class Crawler:
         of.close()
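A hedged usage sketch of the new `crawl()` signature introduced by this diff. The import path `sitemap_gen` is an assumption (the diff does not show the module name); adjust it to the actual file containing `Crawler`.

```python
# Assumed module name; the diff only shows the Crawler class itself.
from sitemap_gen import Crawler

crawler = Crawler('http://example.com/', outputfile='sitemap.xml',
                  logfile='error.log', oformat='xml')

# With gevent importable and pool_size > 1 the crawl runs on a greenlet pool;
# otherwise crawl() falls back to the serial while-loop shown above.
crawler.crawl(echo=True, pool_size=10)
```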