
Code cleanup

pysitemap-python-2.7
Adam Taylor 7 years ago
parent commit 3ab04f6da0
3 changed files with 22 additions and 29 deletions
  1. README.md (+9, -9)
  2. pysitemap/crawler.py (+8, -15)
  3. run.py (+5, -5)

README.md (+9, -9)

@@ -16,10 +16,10 @@ Sitemap-generator uses [gevent](http://www.gevent.org/) to implement multiproces
 import pysitemap
-if __name__=='__main__':
-    url = 'http://www.example.com/' # url from to crawl
-    logfile = 'errlog.log' # path to logfile
-    oformat = 'xml' # output format
+if __name__ == '__main__':
+    url = 'http://www.example.com/' # url from to crawl
+    logfile = 'errlog.log' # path to logfile
+    oformat = 'xml' # output format
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
     crawl.crawl()
@@ -30,9 +30,9 @@ Sitemap-generator uses [gevent](http://www.gevent.org/) to implement multiproces
 import pysitemap
-if __name__=='__main__':
-    url = 'http://www.example.com/' # url from to crawl
-    logfile = 'errlog.log' # path to logfile
-    oformat = 'xml' # output format
+if __name__ == '__main__':
+    url = 'http://www.example.com/' # url from to crawl
+    logfile = 'errlog.log' # path to logfile
+    oformat = 'xml' # output format
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
-    crawl.crawl(pool_size=10) # 10 parsing processes
+    crawl.crawl(pool_size=10) # 10 parsing processes

pysitemap/crawler.py (+8, -15)

@@ -1,16 +1,12 @@
 import urllib
 from bs4 import BeautifulSoup
 import urlparse
 import mechanize
-import pickle
 import re
-try:
+try:
     import sys
     if 'threading' in sys.modules:
         del sys.modules['threading']
-        print('threading module loaded before patching!')
-        print('threading module deleted from sys.modules!\n')
-    import gevent
     from gevent import monkey, pool
     monkey.patch_all()
     gevent_installed = True
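
Aside: the surviving try block is the usual optional-gevent pattern, i.e. monkey-patch the standard library before anything else pulls in threading, and fall back to serial crawling when gevent is missing. The except branch is outside this hunk, so the sketch below fills it in as an assumption rather than quoting the commit; make_pool is a hypothetical helper.

    # Minimal sketch of the optional-gevent import pattern (Python 2).
    # Only the try side appears in the hunk above; the except branch is assumed.
    try:
        from gevent import monkey, pool
        monkey.patch_all()        # patch sockets/threads before other modules import them
        gevent_installed = True
    except ImportError:
        gevent_installed = False  # assumed fallback: crawl without a greenlet pool

    def make_pool(pool_size=10):
        """Return a gevent pool when available, otherwise None (hypothetical helper)."""
        return pool.Pool(pool_size) if gevent_installed else None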
@@ -36,7 +32,7 @@ class Crawler:
         self.exts = exts

     def allow_regex(self, regex=None):
-        if not regex is None:
+        if regex is not None:
             self.allowed_regex = regex
         else:
             allowed_regex = ''
@@ -71,7 +67,7 @@ class Crawler:
         else:
             print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), len(self.urls)))

-        # Set the startingpoint for the spider and initialize
+        # Set the startingpoint for the spider and initialize
         # the a mechanize browser object
         if not self.urls:
@@ -84,10 +80,10 @@
         if response.code >= 400:
             self.errlog("Error {} at url {}".format(response.code, url))
             return
         for link in br.links():
-            newurl = urlparse.urljoin(link.base_url, link.url)
-            #print newurl
+            newurl = urlparse.urljoin(link.base_url, link.url)
+            # print(newurl)
             if self.is_valid(newurl):
                 self.visited.update([newurl])
                 self.urls.update([newurl])
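
For context on the urljoin call kept in this hunk: mechanize reports links whose url may be relative, so joining it against link.base_url yields an absolute URL before validation. A small illustration (the example.com paths are made up):

    # urlparse is the Python 2 module; in Python 3 the same function lives in urllib.parse.
    import urlparse

    # A relative href found on a page resolves against that page's URL.
    print(urlparse.urljoin('http://www.example.com/docs/', 'page.html'))
    # -> http://www.example.com/docs/page.html

    # An already absolute href is returned unchanged.
    print(urlparse.urljoin('http://www.example.com/docs/', 'http://www.example.com/about'))
    # -> http://www.example.com/about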
@@ -97,15 +93,12 @@
         br.close()
         del(br)

     def is_valid(self, url):
-        valid = False
         if '#' in url:
             url = url[:url.find('#')]
         if url in self.visited:
             return False
-        if not self.url in url:
+        if self.url not in url:
             return False
         if re.search(self.regex, url):
             return False
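
Read as a whole, the filter this hunk cleans up amounts to the standalone check sketched below; the final return True and the explicit parameter names are filled in for illustration and are not lines from the commit.

    import re

    def is_valid(url, base_url, visited, exclude_regex):
        """Hypothetical standalone paraphrase of Crawler.is_valid after this commit."""
        if '#' in url:
            url = url[:url.find('#')]  # ignore in-page fragments
        if url in visited:
            return False               # already crawled
        if base_url not in url:
            return False               # off-site link
        if re.search(exclude_regex, url):
            return False               # matches the exclusion pattern (e.g. unwanted extensions)
        return True                    # assumed: accept everything else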
@@ -132,4 +125,4 @@ class Crawler:
         while self.visited:
             of.write(url_str.format(self.visited.pop()))
-        of.close()
+        of.close()

run.py (+5, -5)

@@ -7,10 +7,10 @@ To install gevent:
 $ pip install gevent
 """
-if __name__=='__main__':
-    url = 'http://www.example.com/' # url from to crawl
-    logfile = 'errlog.log' # path to logfile
-    oformat = 'xml' # output format
-    outputfile = 'sitemap.xml' # path to output file
+if __name__ == '__main__':
+    url = 'http://www.example.com/' # url from to crawl
+    logfile = 'errlog.log' # path to logfile
+    oformat = 'xml' # output format
+    outputfile = 'sitemap.xml' # path to output file
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
     crawl.crawl(pool_size=20)
