diff --git a/README.md b/README.md
index b6f72c6..d6e1b73 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,10 @@ Sitemap-generator uses [gevent](http://www.gevent.org/) to implement multiproces
 
     import pysitemap
 
-    if __name__=='__main__':
-        url = 'http://www.example.com/' # url from to crawl
-        logfile = 'errlog.log' # path to logfile
-        oformat = 'xml' # output format
+    if __name__ == '__main__':
+        url = 'http://www.example.com/'  # url from to crawl
+        logfile = 'errlog.log'  # path to logfile
+        oformat = 'xml'  # output format
         crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
         crawl.crawl()
 
@@ -30,9 +30,9 @@ Sitemap-generator uses [gevent](http://www.gevent.org/) to implement multiproces
 
     import pysitemap
 
-    if __name__=='__main__':
-        url = 'http://www.example.com/' # url from to crawl
-        logfile = 'errlog.log' # path to logfile
-        oformat = 'xml' # output format
+    if __name__ == '__main__':
+        url = 'http://www.example.com/'  # url from to crawl
+        logfile = 'errlog.log'  # path to logfile
+        oformat = 'xml'  # output format
         crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
-        crawl.crawl(pool_size=10) # 10 parsing processes
+        crawl.crawl(pool_size=10)  # 10 parsing processes
diff --git a/pysitemap/crawler.py b/pysitemap/crawler.py
index c48a7bd..11b11a4 100644
--- a/pysitemap/crawler.py
+++ b/pysitemap/crawler.py
@@ -1,16 +1,12 @@
-import urllib
-from bs4 import BeautifulSoup
 import urlparse
 import mechanize
-import pickle
 import re
-try: 
+try:
     import sys
     if 'threading' in sys.modules:
         del sys.modules['threading']
         print('threading module loaded before patching!')
         print('threading module deleted from sys.modules!\n')
-    import gevent
     from gevent import monkey, pool
     monkey.patch_all()
     gevent_installed = True
@@ -36,7 +32,7 @@ class Crawler:
         self.exts = exts
 
     def allow_regex(self, regex=None):
-        if not regex is None:
+        if regex is not None:
             self.allowed_regex = regex
         else:
             allowed_regex = ''
@@ -71,7 +67,7 @@
             else:
                 print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), len(self.urls)))
 
 
-        # Set the startingpoint for the spider and initialize 
+        # Set the startingpoint for the spider and initialize
         # the a mechanize browser object
         if not self.urls:
@@ -84,10 +80,10 @@
         if response.code >= 400:
             self.errlog("Error {} at url {}".format(response.code, url))
             return
-        
+
         for link in br.links():
-            newurl = urlparse.urljoin(link.base_url, link.url) 
-            #print newurl
+            newurl = urlparse.urljoin(link.base_url, link.url)
+            # print(newurl)
             if self.is_valid(newurl):
                 self.visited.update([newurl])
                 self.urls.update([newurl])
@@ -97,15 +93,12 @@
         br.close()
         del(br)
 
-
-
     def is_valid(self, url):
-        valid = False
         if '#' in url:
             url = url[:url.find('#')]
         if url in self.visited:
             return False
-        if not self.url in url:
+        if self.url not in url:
             return False
         if re.search(self.regex, url):
             return False
@@ -132,4 +125,4 @@ class Crawler:
         while self.visited:
             of.write(url_str.format(self.visited.pop()))
 
-        of.close()
\ No newline at end of file
+        of.close()
diff --git a/run.py b/run.py
index 85cbe14..82d8947 100644
--- a/run.py
+++ b/run.py
@@ -7,10 +7,10 @@ To install gevent:
 $ pip install gevent
 """
 
-if __name__=='__main__':
-    url = 'http://www.example.com/' # url from to crawl
-    logfile = 'errlog.log' # path to logfile
-    oformat = 'xml' # output format
-    outputfile = 'sitemap.xml' # path to output file
+if __name__ == '__main__':
+    url = 'http://www.example.com/'  # url from to crawl
+    logfile = 'errlog.log'  # path to logfile
+    oformat = 'xml'  # output format
+    outputfile = 'sitemap.xml'  # path to output file
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
     crawl.crawl(pool_size=20)
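
Note for reviewers: the crawler.py hunk keeps the optional-gevent import dance (purging a prematurely loaded `threading` module, then monkey-patching and using a greenlet pool). Below is a minimal sketch of that pattern, for context only; `parse_page` and the URL list are illustrative placeholders and not part of this patch.

    # Sketch: optional gevent with a bounded greenlet pool (assumed fallback to serial parsing).
    try:
        from gevent import monkey, pool
        monkey.patch_all()  # must run before other modules grab the stdlib socket/threading
        gevent_installed = True
    except ImportError:
        gevent_installed = False

    def parse_page(url):  # illustrative stand-in for the crawler's per-URL work
        print('parsing {}'.format(url))

    urls = ['http://www.example.com/', 'http://www.example.com/about']

    if gevent_installed:
        p = pool.Pool(10)          # at most 10 concurrent greenlets, like pool_size=10
        for url in urls:
            p.spawn(parse_page, url)
        p.join()                   # wait for all spawned greenlets to finish
    else:
        for url in urls:
            parse_page(url)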