Browse Source

using sets

pysitemap-python-2.7
Kamo Petrosyan 9 years ago
parent
commit
06d9116d20
4 changed files with 3755 additions and 15 deletions
  1. +9
    -11
      pysitemap/crawler.py
  2. +5
    -3
      run.py
  3. +1
    -1
      setup.py
  4. +3740
    -0
      sitemap.xml

+ 9
- 11
pysitemap/crawler.py View File

@@ -18,9 +18,8 @@ class Crawler:
         self.outputfile = outputfile
         # create lists for the urls in que and visited urls
-        self.urls = [url]
-        self.visited = [url]
-        self.excepted = []
+        self.urls = set([url])
+        self.visited = set([url])
         self.exts = ['htm', 'php']
         self.allowed_regex = '(\w+)\.((?!htm)(?!rar)\w+)$'
@@ -40,25 +39,24 @@ class Crawler:
         self.regex = re.compile(self.allowed_regex)
         while len(self.urls)>0:
             try:
-                self.br.open(self.urls[0])
+                url = self.urls.pop()
+                self.br.open(url)
                 for link in self.br.links():
                     newurl = urlparse.urljoin(link.base_url,link.url)
                     #print newurl
                     if self.is_valid(newurl):
-                        self.visited.append(newurl)
-                        self.urls.append(newurl)
+                        self.visited.update([newurl])
+                        self.urls.update([newurl])
             except Exception, e:
                 self.errlog(e.message)
-            self.urls.pop(0)
         if self.oformat == 'xml':
             self.write_xml()

     def is_valid(self, url):
         valid = False
-        if url in self.visited and not url in self.excepted:
+        if url in self.visited:
             return False
         if not self.url in url:
             return False
@@ -75,8 +73,8 @@ class Crawler:
         of.write('<?xml version="1.0" encoding="utf-8"?><!--Generated by Screaming Frog SEO Spider 2,55-->\n')
         of.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n')
         url_str = '<url><loc>{}</loc></url>\n'
-        for url in self.visited:
-            of.write(url_str.format(url))
+        while self.visited:
+            of.write(url_str.format(self.visited.pop()))
         of.write('</urlset>')
         of.close()


+ 5
- 3
run.py View File

@@ -1,13 +1,15 @@
 import pysitemap
+import datetime

 """
 Example script
 """

 if __name__=='__main__':
-    url = 'http://www.ltsvet.ru/' # url from to crawl
+    url = 'http://www.techelec.ru/' # url from to crawl
     logfile = 'errlog.log' # path to logfile
     oformat = 'xml' # output format
     crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
+    print datetime.datetime.now()
     crawl.crawl()
+    print datetime.datetime.now()

+ 1
- 1
setup.py View File

@@ -13,7 +13,7 @@ setup(
     version=get_version(
         major=0,
         minor=2,
-        build=2,
+        build=3,
     ),
     packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
     include_package_data=True,


+ 3740
- 0
sitemap.xml
File diff suppressed because it is too large
View File


Loading…
Cancel
Save