
Code cleanup

pysitemap-python-2.7
Adam Taylor 8 years ago
parent
commit 3ab04f6da0
3 changed files with 22 additions and 29 deletions:
  1. README.md (+9, -9)
  2. pysitemap/crawler.py (+8, -15)
  3. run.py (+5, -5)

README.md (+9, -9)

@@ -16,10 +16,10 @@ Sitemap-generator uses [gevent](http://www.gevent.org/) to implement multiprocessing
 import pysitemap

-if __name__=='__main__':
-    url = 'http://www.example.com/' # url from to crawl
-    logfile = 'errlog.log' # path to logfile
-    oformat = 'xml' # output format
+if __name__ == '__main__':
+    url = 'http://www.example.com/' # url from to crawl
+    logfile = 'errlog.log' # path to logfile
+    oformat = 'xml' # output format
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
     crawl.crawl()
@@ -30,9 +30,9 @@ Sitemap-generator uses [gevent](http://www.gevent.org/) to implement multiprocessing
 import pysitemap

-if __name__=='__main__':
-    url = 'http://www.example.com/' # url from to crawl
-    logfile = 'errlog.log' # path to logfile
-    oformat = 'xml' # output format
+if __name__ == '__main__':
+    url = 'http://www.example.com/' # url from to crawl
+    logfile = 'errlog.log' # path to logfile
+    oformat = 'xml' # output format
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
-    crawl.crawl(pool_size=10) # 10 parsing processes
+    crawl.crawl(pool_size=10) # 10 parsing processes
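
For reference, the README snippet as it stands after this cleanup can be run as a single script. The Crawler keyword arguments and the crawl() call are exactly those shown in the diff above; the installation and Python 2.7 environment are assumptions (the branch name suggests Python 2.7). A minimal sketch:

    # Sketch of the README usage shown above, assuming pysitemap is installed.
    import pysitemap

    if __name__ == '__main__':
        url = 'http://www.example.com/'  # site to crawl
        logfile = 'errlog.log'           # path to logfile
        oformat = 'xml'                  # output format
        crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
        crawl.crawl(pool_size=10)        # 10 parsing workers, as in the second README example;
                                         # the first example simply calls crawl.crawl()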

pysitemap/crawler.py (+8, -15)

@@ -1,16 +1,12 @@
-import urllib
-from bs4 import BeautifulSoup
 import urlparse
 import mechanize
-import pickle
 import re
-try:
+try:
     import sys
     if 'threading' in sys.modules:
         del sys.modules['threading']
         print('threading module loaded before patching!')
         print('threading module deleted from sys.modules!\n')
-    import gevent
     from gevent import monkey, pool
     monkey.patch_all()
     gevent_installed = True
@@ -36,7 +32,7 @@ class Crawler:
         self.exts = exts

     def allow_regex(self, regex=None):
-        if not regex is None:
+        if regex is not None:
             self.allowed_regex = regex
         else:
             allowed_regex = ''
@@ -71,7 +67,7 @@ class Crawler:
         else:
             print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), len(self.urls)))

-        # Set the startingpoint for the spider and initialize
+        # Set the startingpoint for the spider and initialize
         # the a mechanize browser object
         if not self.urls:
@@ -84,10 +80,10 @@
         if response.code >= 400:
             self.errlog("Error {} at url {}".format(response.code, url))
             return

         for link in br.links():
-            newurl = urlparse.urljoin(link.base_url, link.url)
-            #print newurl
+            newurl = urlparse.urljoin(link.base_url, link.url)
+            # print(newurl)
             if self.is_valid(newurl):
                 self.visited.update([newurl])
                 self.urls.update([newurl])
@@ -97,15 +93,12 @@
         br.close()
         del(br)

     def is_valid(self, url):
-        valid = False
         if '#' in url:
             url = url[:url.find('#')]
         if url in self.visited:
             return False
-        if not self.url in url:
+        if self.url not in url:
             return False
         if re.search(self.regex, url):
             return False
@@ -132,4 +125,4 @@ class Crawler:
         while self.visited:
             of.write(url_str.format(self.visited.pop()))
-        of.close()
+        of.close()
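
The is_valid() hunk above drops an unused valid flag and rewrites `if not self.url in url` as the more idiomatic `if self.url not in url`. Below is a standalone sketch of the same filtering logic; the function and parameter names are illustrative stand-ins for the Crawler attributes (self.url, self.visited, self.regex), and the extension checks that follow in the real method are omitted.

    import re

    def is_valid_url(url, base_url, visited, blocked_regex):
        # Hypothetical re-implementation of the filter shown in the diff above.
        if '#' in url:
            url = url[:url.find('#')]      # drop the fragment before comparing
        if url in visited:
            return False                   # already crawled
        if base_url not in url:            # stay on the same site
            return False
        if re.search(blocked_regex, url):  # caller-supplied exclusion pattern
            return False
        return True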

run.py (+5, -5)

@@ -7,10 +7,10 @@ To install gevent:
 $ pip install gevent
 """

-if __name__=='__main__':
-    url = 'http://www.example.com/' # url from to crawl
-    logfile = 'errlog.log' # path to logfile
-    oformat = 'xml' # output format
-    outputfile = 'sitemap.xml' # path to output file
+if __name__ == '__main__':
+    url = 'http://www.example.com/' # url from to crawl
+    logfile = 'errlog.log' # path to logfile
+    oformat = 'xml' # output format
+    outputfile = 'sitemap.xml' # path to output file
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
     crawl.crawl(pool_size=20)
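
The crawl(pool_size=20) call in run.py leans on the gevent block kept at the top of pysitemap/crawler.py: monkey.patch_all() followed by a gevent pool. A minimal, self-contained sketch of that pattern follows; only the monkey/pool calls come from the diff, while the fetch() helper and the URL list are made up for illustration (Python 2.7, matching the branch).

    from gevent import monkey, pool
    monkey.patch_all()  # patch the stdlib before importing socket users

    import urllib2  # Python 2.7 stdlib; cooperative after patch_all()

    def fetch(url):
        # hypothetical worker: fetch one page and report its size
        body = urllib2.urlopen(url).read()
        print('{}: {} bytes'.format(url, len(body)))

    if __name__ == '__main__':
        urls = ['http://www.example.com/'] * 5  # placeholder URL list
        workers = pool.Pool(20)                 # same role as pool_size=20 above
        for u in urls:
            workers.spawn(fetch, u)
        workers.join()                          # wait for every greenlet to finish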
