From 5d2a2729c60dea5d7464b5bb9e6625871dca1f98 Mon Sep 17 00:00:00 2001
From: Kamo Petrosyan
Date: Fri, 10 Jul 2015 01:25:12 +0600
Subject: [PATCH] 0.3.6

---
 pysitemap/crawler.py | 102 ++++++++++++++++---------------------------
 setup.py             |   2 +-
 2 files changed, 39 insertions(+), 65 deletions(-)

diff --git a/pysitemap/crawler.py b/pysitemap/crawler.py
index 9ad8d97..049d141 100644
--- a/pysitemap/crawler.py
+++ b/pysitemap/crawler.py
@@ -1,13 +1,16 @@
+# ~*~ coding: utf-8 ~*~
+__author__ = 'Kamo Petrosyan'
+
 import urllib
 from bs4 import BeautifulSoup
 import urlparse
 import mechanize
 import pickle
 import re
-try: 
+try:
     import sys
     import gevent
-    from gevent import monkey, pool, queue
+    from gevent import monkey, pool
     monkey.patch_all()
     if 'threading' in sys.modules:
         del sys.modules['threading']
@@ -28,8 +31,7 @@ class Crawler:
 
         # create lists for the urls in que and visited urls
         self.urls = set([url])
-        self.visited = set([])
-        self.excepted = set([])
+        self.visited = set([url])
 
         self.exts = ['htm', 'php']
         self.allowed_regex = '\.((?!htm)(?!php)\w+)$'
@@ -50,77 +52,51 @@ class Crawler:
         self.regex = re.compile(self.allowed_regex)
         if gevent_installed and pool_size > 1:
             self.pool = pool.Pool(pool_size)
-            self.queue = gevent.queue.Queue()
-            self.queue.put(self.url)
             self.pool.spawn(self.parse_gevent)
-            while not self.queue.empty() and not self.pool.free_count() == pool_size:
-                gevent.sleep(0.1)
-                while len(self.urls) > 0:
-                    self.queue.put(self.urls.pop())
-                for x in xrange(0, min(self.queue.qsize(), self.pool.free_count())):
-                    self.pool.spawn(self.parse_gevent)
             self.pool.join()
         else:
             while len(self.urls) > 0:
                 self.parse()
         if self.oformat == 'xml':
             self.write_xml()
-        self.errlog()
 
     def parse_gevent(self):
-        url = self.queue.get(timeout=0)
-        try:
-            br = mechanize.Browser()
-            response = br.open(url)
-            if response.code >= 400:
-                self.excepted.update([(url, "Error {} at url {}".format(response.code, url))])
-                return
-
-            for link in br.links():
-                newurl = urlparse.urljoin(link.base_url, link.url)
-                if self.is_valid(newurl):
-                    self.visited.update([newurl])
-                    self.urls.update([newurl])
-        except Exception, e:
-            self.excepted.update([(url, e.message)])
-            return
-        except gevent.queue.Empty:
-            br.close()
-            if self.echo:
-                print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), self.queue.qsize()))
-            return
-
-        br.close()
-        if self.echo:
-            print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), self.queue.qsize()))
-
+        self.parse()
+        while len(self.urls) > 0 and not self.pool.full():
+            self.pool.spawn(self.parse_gevent)
 
     def parse(self):
         if self.echo:
-            print('{} pages parsed :: {} pages in the queue'.format(len(self.visited), len(self.urls)))
+            if not gevent_installed:
+                print('{} pages parsed :: {} pages in the queue'.format(len(self.visited), len(self.urls)))
+            else:
+                print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), len(self.urls)))
 
         # Set the startingpoint for the spider and initialize
         # the a mechanize browser object
-        url = self.urls.pop()
-        br = mechanize.Browser()
-        try:
-            response = br.open(url)
-            if response.code >= 400:
-                self.excepted.update([(url, "Error {} at url {}".format(response.code, url))])
-                return
-
-            for link in br.links():
-                newurl = urlparse.urljoin(link.base_url, link.url)
-                #print newurl
-                if self.is_valid(newurl):
-                    self.urls.update([newurl])
-                    self.visited.update([url])
-        except Exception, e:
-            self.excepted.update([(url, e.message)])
+        if not self.urls:
+            return
+        else:
+            url = self.urls.pop()
+            br = mechanize.Browser()
+            try:
+                response = br.open(url)
+                if response.code >= 400:
+                    self.errlog("Error {} at url {}".format(response.code, url))
+                    return
+
+                for link in br.links():
+                    newurl = urlparse.urljoin(link.base_url, link.url)
+                    #print newurl
+                    if self.is_valid(newurl):
+                        self.visited.update([newurl])
+                        self.urls.update([newurl])
+            except Exception, e:
+                self.errlog(e.message)
 
-        br.close()
-        del(br)
+            br.close()
+            del(br)
@@ -134,11 +110,9 @@ class Crawler:
                 return False
         return True
 
-    def errlog(self):
-        while len(self.excepted) > 0:
-            ex = self.excepted.pop()
-            self.logfile.write('{}\n'.format('\t'.join(ex)))
-        self.logfile.close()
+    def errlog(self, msg):
+        self.logfile.write(msg)
+        self.logfile.write('\n')
 
     def write_xml(self):
         of = open(self.outputfile, 'w')
@@ -149,4 +123,4 @@
             of.write(url_str.format(self.visited.pop()))
 
         of.write('</urlset>')
-        of.close()
+        of.close()
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d0d96de..642d82c 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ setup(
     version=get_version(
         major=0,
         minor=3,
-        build=5,
+        build=6,
     ),
     packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
     include_package_data=True,
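
Note: a minimal usage sketch of the patched Crawler follows (Python 2, since the module relies on urlparse and mechanize). The constructor keywords and the crawl() entry point are inferred from the attribute names visible in this diff, not from signatures shown in it, so treat them as assumptions rather than the confirmed API:

    # Hypothetical usage sketch; keyword names and the crawl() signature are
    # assumptions inferred from this diff, not confirmed by it.
    from pysitemap.crawler import Crawler

    crawler = Crawler(url='http://example.com/', outputfile='sitemap.xml',
                      logfile='errlog.log', oformat='xml')
    crawler.crawl(pool_size=10)  # falls back to the serial parse() loop when gevent is absent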