diff --git a/pysitemap/crawler.py b/pysitemap/crawler.py
index 9a07920..9ad8d97 100644
--- a/pysitemap/crawler.py
+++ b/pysitemap/crawler.py
@@ -4,10 +4,15 @@ import urlparse
 import mechanize
 import pickle
 import re
-try:
+try:
+    import sys
     import gevent
-    from gevent import monkey, pool
+    from gevent import monkey, pool, queue
     monkey.patch_all()
+    if 'threading' in sys.modules:
+        del sys.modules['threading']
+        print('threading module loaded before patching!\n')
+        print('threading module deleted from sys.modules!\n')
     gevent_installed = True
 except:
     print("Gevent does not installed. Parsing process will be slower.")
@@ -23,7 +28,8 @@ class Crawler:
 
         # create lists for the urls in que and visited urls
         self.urls = set([url])
-        self.visited = set([url])
+        self.visited = set([])
+        self.excepted = set([])
         self.exts = ['htm', 'php']
         self.allowed_regex = '\.((?!htm)(?!php)\w+)$'
 
@@ -44,51 +50,77 @@ class Crawler:
         self.regex = re.compile(self.allowed_regex)
         if gevent_installed and pool_size > 1:
             self.pool = pool.Pool(pool_size)
+            self.queue = gevent.queue.Queue()
+            self.queue.put(self.url)
             self.pool.spawn(self.parse_gevent)
+            while not self.queue.empty() and not self.pool.free_count() == pool_size:
+                gevent.sleep(0.1)
+                while len(self.urls) > 0:
+                    self.queue.put(self.urls.pop())
+                for x in xrange(0, min(self.queue.qsize(), self.pool.free_count())):
+                    self.pool.spawn(self.parse_gevent)
             self.pool.join()
         else:
             while len(self.urls) > 0:
                 self.parse()
         if self.oformat == 'xml':
             self.write_xml()
+        self.errlog()
 
     def parse_gevent(self):
-        self.parse()
-        while len(self.urls) > 0 and not self.pool.full():
-            self.pool.spawn(self.parse_gevent)
+        url = self.queue.get(timeout=0)
+        try:
+            br = mechanize.Browser()
+            response = br.open(url)
+            if response.code >= 400:
+                self.excepted.update([(url, "Error {} at url {}".format(response.code, url))])
+                return
+
+            for link in br.links():
+                newurl = urlparse.urljoin(link.base_url, link.url)
+                if self.is_valid(newurl):
+                    self.visited.update([newurl])
+                    self.urls.update([newurl])
+        except Exception, e:
+            self.excepted.update([(url, e.message)])
+            return
+        except gevent.queue.Empty:
+            br.close()
+            if self.echo:
+                print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), self.queue.qsize()))
+            return
+
+        br.close()
+        if self.echo:
+            print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), self.queue.qsize()))
+
     def parse(self):
         if self.echo:
-            if not gevent_installed:
-                print('{} pages parsed :: {} pages in the queue'.format(len(self.visited), len(self.urls)))
-            else:
-                print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), len(self.urls)))
+            print('{} pages parsed :: {} pages in the queue'.format(len(self.visited), len(self.urls)))
         # Set the startingpoint for the spider and initialize
         # the a mechanize browser object
+        url = self.urls.pop()
+        br = mechanize.Browser()
+        try:
+            response = br.open(url)
+            if response.code >= 400:
+                self.excepted.update([(url, "Error {} at url {}".format(response.code, url))])
+                return
+
+            for link in br.links():
+                newurl = urlparse.urljoin(link.base_url, link.url)
+                #print newurl
+                if self.is_valid(newurl):
+                    self.urls.update([newurl])
 
-        if not self.urls:
-            return
-        else:
-            url = self.urls.pop()
-            br = mechanize.Browser()
-            try:
-                response = br.open(url)
-                if response.code >= 400:
-                    self.errlog("Error {} at url {}".format(response.code, url))
-                    return
-
-                for link in br.links():
-                    newurl = urlparse.urljoin(link.base_url, link.url)
-                    #print newurl
-                    if self.is_valid(newurl):
-                        self.visited.update([newurl])
-                        self.urls.update([newurl])
-            except Exception, e:
-                self.errlog(e.message)
+            self.visited.update([url])
+        except Exception, e:
+            self.excepted.update([(url, e.message)])
 
-            br.close()
-            del(br)
+        br.close()
+        del(br)
 
@@ -102,9 +134,11 @@ class Crawler:
                 return False
         return True
 
-    def errlog(self, msg):
-        self.logfile.write(msg)
-        self.logfile.write('\n')
+    def errlog(self):
+        while len(self.excepted) > 0:
+            ex = self.excepted.pop()
+            self.logfile.write('{}\n'.format('\t'.join(ex)))
+        self.logfile.close()
 
     def write_xml(self):
         of = open(self.outputfile, 'w')
@@ -116,7 +150,3 @@ class Crawler:
         of.write('')
 
         of.close()
-
-
-
-
diff --git a/setup.py b/setup.py
index b5f1759..5d0bc34 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ setup(
     version=get_version(
         major=0,
         minor=3,
-        build=4,
+        build=5,
     ),
     packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
     include_package_data=True,
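For reference, a minimal usage sketch of the changed flow (queue-driven gevent crawl, then errlog() draining self.excepted into the log file). The constructor arguments and the crawl() signature shown here are assumptions inferred from attributes used in this diff (self.url, self.urls, self.oformat, self.outputfile, self.logfile, pool_size), not taken from the project README:

    # Hypothetical usage; argument names are inferred from this diff, not from pysitemap docs.
    from pysitemap.crawler import Crawler

    c = Crawler(
        url='http://example.com/',   # assumed: seeds self.url / self.urls
        outputfile='sitemap.xml',    # assumed: target of write_xml() when oformat == 'xml'
        logfile='errlog.txt',        # assumed: errlog() writes self.excepted here, then closes it
        oformat='xml',
    )
    c.crawl(pool_size=10)            # pool_size > 1 takes the gevent pool/queue path; otherwise parse() loops serially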