
Multiprocessing version 0.3.4

pysitemap-python-2.7
Kamo Petrosyan committed 9 years ago
commit c8e3c70224
5 changed files with 1186 additions and 28 deletions

  1. +21   -1    README.md
  2. +52   -12   pysitemap/crawler.py
  3. +3    -13   run.py
  4. +2    -2    setup.py
  5. +1108 -0    sitemap.xml

+21 -1   README.md

@@ -5,6 +5,12 @@ Sitemap generator
     pip install sitemap-generator
 
+## Gevent
+
+Sitemap-generator uses gevent to implement multiprocessing. Install gevent:
+
+    pip install gevent
+
 ## example
 
     import pysitemap
@@ -16,4 +22,18 @@ Sitemap generator
     oformat = 'xml'  # output format
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
     crawl.crawl()
+
+## multiprocessing example
+
+    import pysitemap
+
+    if __name__ == '__main__':
+        url = 'http://www.example.com/'  # url to crawl
+        logfile = 'errlog.log'  # path to logfile
+        oformat = 'xml'  # output format
+        crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
+        crawl.crawl(pool_size=20)
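
The multiprocessing example is otherwise identical to the plain one; the difference is entirely in the final call. As the crawler.py diff below shows, crawl() now accepts echo and pool_size keyword arguments, and pool_size > 1 (with gevent installed) switches the crawler from the serial loop to a gevent pool. For instance, to watch progress while crawling with 20 workers:

    crawl.crawl(echo=True, pool_size=20)  # echo prints parsed/queued counts per page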

+52 -12   pysitemap/crawler.py

@@ -4,15 +4,19 @@ import urlparse
 import mechanize
 import pickle
 import re
 
+try:
+    import gevent
+    from gevent import monkey, pool
+    monkey.patch_all()
+    gevent_installed = True
+except ImportError:
+    print("Gevent is not installed. Parsing will be slower.")
+    gevent_installed = False
+
 
 class Crawler:
     def __init__(self, url, outputfile='sitemap.xml', logfile='error.log', oformat='xml'):
-        # Set the starting point for the spider and initialize
-        # a mechanize browser object
         self.url = url
-        self.br = mechanize.Browser()
         self.logfile = open(logfile, 'a')
         self.oformat = oformat
         self.outputfile = outputfile
@@ -35,14 +39,47 @@ class Crawler:
             allowed_regex += '(!{})'.format(ext)
         self.allowed_regex = '\.({}\w+)$'.format(allowed_regex)
 
-    def crawl(self):
+    def crawl(self, echo=False, pool_size=1):
+        self.echo = echo
         self.regex = re.compile(self.allowed_regex)
-        while len(self.urls)>0:
+        if gevent_installed and pool_size > 1:
+            self.pool = pool.Pool(pool_size)
+            self.pool.spawn(self.parse_gevent)
+            self.pool.join()
+        else:
+            while len(self.urls) > 0:
+                self.parse()
+        if self.oformat == 'xml':
+            self.write_xml()
+
+    def parse_gevent(self):
+        self.parse()
+        while len(self.urls) > 0 and not self.pool.full():
+            self.pool.spawn(self.parse_gevent)
+
+    def parse(self):
+        if self.echo:
+            if not gevent_installed:
+                print('{} pages parsed :: {} pages in the queue'.format(len(self.visited), len(self.urls)))
+            else:
+                print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), len(self.urls)))
+        # Set the starting point for the spider and initialize
+        # a mechanize browser object
+        if not self.urls:
+            return
+        else:
+            url = self.urls.pop()
+            br = mechanize.Browser()
         try:
-            url = self.urls.pop()
-            self.br.open(url)
-            for link in self.br.links():
-                newurl = urlparse.urljoin(link.base_url,link.url)
+            response = br.open(url)
+            if response.code >= 400:
+                self.errlog("Error {} at url {}".format(response.code, url))
+                return
+            for link in br.links():
+                newurl = urlparse.urljoin(link.base_url, link.url)
                 #print newurl
                 if self.is_valid(newurl):
                     self.visited.update([newurl])
@@ -50,8 +87,9 @@ class Crawler:
         except Exception, e:
             self.errlog(e.message)
 
-        if self.oformat == 'xml':
-            self.write_xml()
+        br.close()
+        del(br)
 
     def is_valid(self, url):
@@ -80,3 +118,5 @@ class Crawler:
         of.close()
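
The concurrency pattern above is worth reading closely: crawl() seeds a gevent pool with one parse_gevent greenlet, and each greenlet does one unit of work and then respawns more workers while the queue is non-empty and the pool has capacity, so the pool grows and shrinks with the crawl frontier. A minimal, self-contained sketch of that pattern, with the mechanize fetch stubbed out by gevent.sleep and plain sets standing in for self.urls and self.visited (all names here are illustrative, not part of the library):

    import gevent
    from gevent import monkey, pool
    monkey.patch_all()

    queue = set(['/a', '/b', '/c'])  # stands in for self.urls
    visited = set()                  # stands in for self.visited
    workers = pool.Pool(3)           # stands in for self.pool

    def parse_once():
        # one unit of work: pop a "url", pretend to fetch it,
        # and enqueue a fake child page for a while
        if not queue:
            return
        url = queue.pop()
        gevent.sleep(0.01)           # stub for br.open(url)
        visited.add(url)
        if len(visited) < 10:        # stub for link extraction
            queue.add(url + 'x')

    def parse_gevent():
        # same shape as Crawler.parse_gevent: work once, then respawn
        # workers while there is work left and the pool is not full
        parse_once()
        while queue and not workers.full():
            workers.spawn(parse_gevent)

    workers.spawn(parse_gevent)
    workers.join()                   # returns once every greenlet has finished
    print('{} pages visited'.format(len(visited)))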

+3 -13   run.py

@@ -5,19 +5,9 @@ Example script
 """
 
 if __name__=='__main__':
-<<<<<<< HEAD
-    url = 'http://www.ltsvet.ru/' # url from to crawl
+    url = 'http://www.dksys.ru/'  # url to crawl
     logfile = 'errlog.log'  # path to logfile
     oformat = 'xml'  # output format
-    outputfile = '/srv/www/site/sitemap.xml' # path to output file
+    outputfile = 'sitemap.xml'  # path to output file
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
-    crawl.crawl()
-=======
-    url = 'http://www.example.ru/' # url from to crawl
-    logfile = 'errlog.log' # path to logfile
-    oformat = 'xml' # output format
-    crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
-    print datetime.datetime.now()
-    crawl.crawl()
-    print datetime.datetime.now()
->>>>>>> origin/master
+    crawl.crawl(pool_size=20)
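
This hunk resolves leftover merge-conflict markers and drops the ad-hoc timing prints from the origin/master side. If you still want a rough wall-clock measurement around the new pooled call, a sketch along those lines (not part of the commit) is:

    import datetime

    start = datetime.datetime.now()
    crawl.crawl(pool_size=20)
    print('crawl finished in {}'.format(datetime.datetime.now() - start))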

+2 -2   setup.py

@@ -12,8 +12,8 @@ setup(
     name='sitemap-generator',
     version=get_version(
         major=0,
-        minor=2,
-        build=8,
+        minor=3,
+        build=4,
     ),
     packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
     include_package_data=True,


+1108 -0   sitemap.xml

File diff suppressed because it is too large

