diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py
new file mode 100644
index 0000000..e71f191
--- /dev/null
+++ b/pysitemap/__init__.py
@@ -0,0 +1 @@
+from crawler import Crawler
\ No newline at end of file
diff --git a/pysitemap/crawler.py b/pysitemap/crawler.py
new file mode 100644
index 0000000..e4932ab
--- /dev/null
+++ b/pysitemap/crawler.py
@@ -0,0 +1,80 @@
+import urllib
+from bs4 import BeautifulSoup
+import urlparse
+import mechanize
+import pickle
+import re
+
+
+class Crawler:
+    def __init__(self, url, outputfile='sitemap.xml', logfile='error.log', oformat='xml'):
+        # Set the starting point for the spider and initialize
+        # the mechanize browser object
+        self.url = url
+        self.br = mechanize.Browser()
+        self.logfile = open(logfile, 'a')
+        self.oformat = oformat
+        self.outputfile = outputfile
+
+        # Create lists for the urls in queue and the visited urls
+        self.urls = [url]
+        self.visited = [url]
+        self.excepted = []
+        self.exts = ['htm', 'php']
+        self.allowed_regex = r'(\w+)\.((?!htm)(?!rar)\w+)$'
+
+    def set_exts(self, exts):
+        self.exts = exts
+
+    def allow_regex(self, regex=None):
+        if regex is not None:
+            self.allowed_regex = regex
+        else:
+            # Build a negative lookahead for every allowed extension
+            allowed_regex = ''
+            for ext in self.exts:
+                allowed_regex += '(?!{})'.format(ext)
+            self.allowed_regex = r'(\w+)\.({}\w+)$'.format(allowed_regex)
+
+    def crawl(self):
+        self.regex = re.compile(self.allowed_regex)
+        while len(self.urls) > 0:
+            try:
+                self.br.open(self.urls[0])
+                for link in self.br.links():
+                    newurl = urlparse.urljoin(link.base_url, link.url)
+                    if self.is_valid(newurl):
+                        self.visited.append(newurl)
+                        self.urls.append(newurl)
+            except Exception, e:
+                self.errlog(str(e))
+
+            self.urls.pop(0)
+
+        if self.oformat == 'xml':
+            self.write_xml()
+
+    def is_valid(self, url):
+        # Reject urls that were already visited, urls outside the
+        # start domain and urls with a disallowed file extension
+        if url in self.visited and url not in self.excepted:
+            return False
+        if self.url not in url:
+            return False
+        if re.search(self.regex, url):
+            return False
+        return True
+
+    def errlog(self, msg):
+        self.logfile.write(msg)
+        self.logfile.write('\n')
+
+    def write_xml(self):
+        of = open(self.outputfile, 'w')
+        of.write('<?xml version="1.0" encoding="utf-8"?>\n')
+        of.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
+        url_str = '  <url><loc>{}</loc></url>\n'
+        for url in self.visited:
+            of.write(url_str.format(url))
+        of.write('</urlset>')
+        of.close()
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..9bb90a5
--- /dev/null
+++ b/run.py
@@ -0,0 +1,13 @@
+import pysitemap
+
+
+"""
+Example script
+"""
+
+if __name__ == '__main__':
+    url = 'http://www.ltsvet.ru/'  # url to crawl
+    logfile = 'errlog.log'  # path to the error log
+    oformat = 'xml'  # output format
+    crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
+    crawl.crawl()
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..8d5fa8b
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,36 @@
+from setuptools import find_packages, setup
+
+EXCLUDE_FROM_PACKAGES = []
+
+
+def get_version(major=0, minor=0, build=0):
+    return '%s.%s.%s' % (major, minor, build)
+
+
+setup(
+    name='pysitemap',
+    version=get_version(
+        major=0,
+        minor=2,
+        build=0,
+    ),
+    packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
+    include_package_data=True,
+    url='https://github.com/Haikson/pysitemap',
+    license='GPL3',
+    author='Kamo Petrosyan',
+    author_email='kamo@haikson.com',
+    description='web crawler and sitemap generator.',
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Environment :: Web Environment',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.7',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+    ],
+    install_requires=['beautifulsoup4', 'mechanize'],
+)
\ No newline at end of file
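
A minimal usage sketch for the new package (the target URL below is a placeholder, and the set_exts()/allow_regex() calls are optional; they are shown only to illustrate how the extension filter is rebuilt before crawling):

    import pysitemap

    # Crawl a site (placeholder URL) and write the result to sitemap.xml
    crawler = pysitemap.Crawler(url='http://example.com/',
                                outputfile='sitemap.xml',
                                logfile='errlog.log',
                                oformat='xml')

    # Extensions passed to set_exts() become negative lookaheads in the
    # filter, i.e. links ending in these extensions are kept while links
    # with any other file extension are skipped. allow_regex() must be
    # called afterwards so that crawl() compiles the rebuilt pattern.
    crawler.set_exts(['htm', 'php', 'html'])
    crawler.allow_regex()
    crawler.crawl()

One design note on crawl(): the current url is popped from the front of the queue only after the try/except block, so a page that raises is still logged via errlog() and then removed, and the loop cannot get stuck retrying a broken link.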