
0.3.6

pysitemap-python-2.7
Kamo Petrosyan 9 years ago
commit 5d2a2729c6
2 changed files with 39 additions and 65 deletions:
  1. pysitemap/crawler.py (+38, -64)
  2. setup.py (+1, -1)

pysitemap/crawler.py (+38, -64)

@@ -1,13 +1,16 @@
+# ~*~ coding: utf-8 ~*~
+__author__ = 'Kamo Petrosyan'
 import urllib
 from bs4 import BeautifulSoup
 import urlparse
 import mechanize
 import pickle
 import re
 try:
     import sys
     import gevent
-    from gevent import monkey, pool, queue
+    from gevent import monkey, pool
     monkey.patch_all()
     if 'threading' in sys.modules:
         del sys.modules['threading']
@@ -28,8 +31,7 @@ class Crawler:
         # create lists for the urls in que and visited urls
         self.urls = set([url])
-        self.visited = set([])
-        self.excepted = set([])
+        self.visited = set([url])
         self.exts = ['htm', 'php']
         self.allowed_regex = '\.((?!htm)(?!php)\w+)$'
@@ -50,77 +52,51 @@ class Crawler:
         self.regex = re.compile(self.allowed_regex)
         if gevent_installed and pool_size > 1:
             self.pool = pool.Pool(pool_size)
-            self.queue = gevent.queue.Queue()
-            self.queue.put(self.url)
             self.pool.spawn(self.parse_gevent)
-            while not self.queue.empty() and not self.pool.free_count() == pool_size:
-                gevent.sleep(0.1)
-                while len(self.urls) > 0:
-                    self.queue.put(self.urls.pop())
-                for x in xrange(0, min(self.queue.qsize(), self.pool.free_count())):
-                    self.pool.spawn(self.parse_gevent)
             self.pool.join()
         else:
             while len(self.urls) > 0:
                 self.parse()
         if self.oformat == 'xml':
             self.write_xml()
-        self.errlog()

     def parse_gevent(self):
-        url = self.queue.get(timeout=0)
-        try:
-            br = mechanize.Browser()
-            response = br.open(url)
-            if response.code >= 400:
-                self.excepted.update([(url, "Error {} at url {}".format(response.code, url))])
-                return
-            for link in br.links():
-                newurl = urlparse.urljoin(link.base_url, link.url)
-                if self.is_valid(newurl):
-                    self.visited.update([newurl])
-                    self.urls.update([newurl])
-        except Exception, e:
-            self.excepted.update([(url, e.message)])
-            return
-        except gevent.queue.Empty:
-            br.close()
-            if self.echo:
-                print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), self.queue.qsize()))
-            return
-        br.close()
-        if self.echo:
-            print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), self.queue.qsize()))
+        self.parse()
+        while len(self.urls) > 0 and not self.pool.full():
+            self.pool.spawn(self.parse_gevent)

     def parse(self):
         if self.echo:
-            print('{} pages parsed :: {} pages in the queue'.format(len(self.visited), len(self.urls)))
+            if not gevent_installed:
+                print('{} pages parsed :: {} pages in the queue'.format(len(self.visited), len(self.urls)))
+            else:
+                print('{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(len(self.visited), len(self.pool), len(self.urls)))
         # Set the startingpoint for the spider and initialize
         # the a mechanize browser object
-        url = self.urls.pop()
-        br = mechanize.Browser()
-        try:
-            response = br.open(url)
-            if response.code >= 400:
-                self.excepted.update([(url, "Error {} at url {}".format(response.code, url))])
-                return
-            for link in br.links():
-                newurl = urlparse.urljoin(link.base_url, link.url)
-                #print newurl
-                if self.is_valid(newurl):
-                    self.urls.update([newurl])
-            self.visited.update([url])
-        except Exception, e:
-            self.excepted.update([(url, e.message)])
-        br.close()
-        del(br)
+        if not self.urls:
+            return
+        else:
+            url = self.urls.pop()
+            br = mechanize.Browser()
+            try:
+                response = br.open(url)
+                if response.code >= 400:
+                    self.errlog("Error {} at url {}".format(response.code, url))
+                    return
+                for link in br.links():
+                    newurl = urlparse.urljoin(link.base_url, link.url)
+                    #print newurl
+                    if self.is_valid(newurl):
+                        self.visited.update([newurl])
+                        self.urls.update([newurl])
+            except Exception, e:
+                self.errlog(e.message)
+            br.close()
+            del(br)
@@ -134,11 +110,9 @@ class Crawler:
             return False
         return True

-    def errlog(self):
-        while len(self.excepted) > 0:
-            ex = self.excepted.pop()
-            self.logfile.write('{}\n'.format('\t'.join(ex)))
-        self.logfile.close()
+    def errlog(self, msg):
+        self.logfile.write(msg)
+        self.logfile.write('\n')

     def write_xml(self):
         of = open(self.outputfile, 'w')
@@ -149,4 +123,4 @@ class Crawler:
             of.write(url_str.format(self.visited.pop()))

         of.write('</urlset>')
-        of.close()
+        of.close()
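
For orientation, here is a minimal usage sketch of the crawler as it stands after this commit. It is not part of the diff: the constructor keyword argument outputfile and the crawl() keywords echo and pool_size are inferred from the attributes referenced above, and the URL is a placeholder.

# Hypothetical usage sketch (Python 2.7); keyword arguments are inferred
# from the attributes used in crawler.py and may not match the real defaults.
from pysitemap.crawler import Crawler

crawler = Crawler('http://www.example.com/', outputfile='sitemap.xml')
# pool_size > 1 takes the gevent pool path; echo prints crawl progress
crawler.crawl(echo=True, pool_size=10)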

setup.py (+1, -1)

@@ -13,7 +13,7 @@ setup(
     version=get_version(
         major=0,
         minor=3,
-        build=5,
+        build=6,
     ),
     packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
     include_package_data=True,
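
The only change in setup.py is the build number, bumping the released version from 0.3.5 to 0.3.6 to match the commit title. The real get_version helper is defined elsewhere in the repository and is not shown in this diff; a hypothetical sketch of what it presumably produces:

# Hypothetical sketch of a get_version helper; the project's actual
# implementation lives elsewhere and may differ.
def get_version(major, minor, build):
    return '{0}.{1}.{2}'.format(major, minor, build)

print(get_version(major=0, minor=3, build=6))  # '0.3.6'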

