Browse Source

Python 3 compatible

pysitemap-python-2.7
Kamo Petrosyan 6 years ago
parent
commit
71ac413b14
5 changed files with 1472 additions and 1113 deletions
  1. +1
    -1
      pysitemap/__init__.py
  2. +7
    -3
      pysitemap/crawler.py
  3. +2
    -2
      run.py
  4. +3
    -2
      setup.py
  5. +1459
    -1105
      sitemap.xml

+ 1
- 1
pysitemap/__init__.py View File

@ -1 +1 @@
from crawler import Crawler
from .crawler import Crawler

+ 7
- 3
pysitemap/crawler.py View File

@ -1,7 +1,10 @@
# -*- coding: utf-8 -*-
import __future__
import sys
import urlparse
if sys.version_info.major == 2:
import urlparse
else:
from urllib import parse as urlparse
import requests
from lxml import html
import re
@ -57,6 +60,7 @@ class Crawler:
self.pool.spawn(self.parse_gevent)
self.pool.join()
else:
self.pool = [None,] # fixing n_poll exception in self.parse with poolsize > 1 and gevent_installed == False
while len(self.urls) > 0:
self.parse()
if self.oformat == 'xml':
@ -105,8 +109,8 @@ class Crawler:
if self.is_valid(newurl):
self.visited.update([newurl])
self.urls.update([newurl])
except Exception, e:
self.errlog(e.message)
except Exception as e:
self.errlog(repr(e))
def is_valid(self, url):
if '#' in url:


+ 2
- 2
run.py View File

@ -8,9 +8,9 @@ To install gevent:
"""
if __name__ == '__main__':
url = 'http://www.example.com/' # URL of the site to crawl
url = 'http://www.lumpro.ru/' # URL of the site to crawl
logfile = 'errlog.log' # path to logfile
oformat = 'xml' # output format
outputfile = 'sitemap.xml' # path to output file
crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
crawl.crawl(pool_size=20)
crawl.crawl(pool_size=20, echo=True)

+ 3
- 2
setup.py View File

@ -12,8 +12,8 @@ setup(
name='sitemap-generator',
version=get_version(
major=0,
minor=4,
build=4,
minor=5,
build=0,
),
packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
include_package_data=True,
@ -30,6 +30,7 @@ setup(
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Topic :: Software Development :: Libraries :: Python Modules',
],
install_requires=['lxml', 'requests'],


+ 1459
- 1105
sitemap.xml
File diff suppressed because it is too large
View File


Loading…
Cancel
Save