Sitemap generator
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

47 lines
2.1 KiB

import asyncio
import signal
from pysitemap.base_crawler import Crawler
def crawler(
        root_url, out_file, out_format='xml',
        maxtasks=10, exclude_urls=None, exclude_imgs=None, image_root_urls=None,
        use_lastmodified=True, verifyssl=True, findimages=True, images_this_domain=True,
        headers=None, timezone_offset=0, changefreq=None, priorities=None):
    """
    Run the crawler to completion and print a short summary of its state.

    Builds a ``Crawler`` from the given options, drives ``Crawler.run()`` on
    the current asyncio event loop, then prints the sizes of the crawler's
    ``todo_queue``, ``busy``, ``done`` and ``tasks`` collections.

    :param root_url: Site root url
    :param out_file: path to the out file
    :param out_format: format of out file [xml, txt]
    :param maxtasks: max count of tasks
    :param exclude_urls: excludable url paths (``None`` means an empty list)
    :param exclude_imgs: excludable img url paths (``None`` means an empty list)
    :param image_root_urls: recognized image root urls on the domain
        (``None`` means an empty list)
    :param use_lastmodified: enable or disable timestamps for fetched urls?
    :param verifyssl: verify website certificate?
    :param findimages: Find images references?
    :param images_this_domain: Find images which refer to this domain only?
    :param headers: Send these headers in every request
    :param timezone_offset: timezone offset for lastmod tags
    :param changefreq: dictionary, where key is site sub url regex, and value is changefreq
    :param priorities: dictionary, where key is site sub url regex, and value is priority float
    :return: None
    """
    # Use a fresh list per call instead of a shared mutable default, so state
    # can never leak between calls through the default objects.
    exclude_urls = [] if exclude_urls is None else exclude_urls
    exclude_imgs = [] if exclude_imgs is None else exclude_imgs
    image_root_urls = [] if image_root_urls is None else image_root_urls

    loop = asyncio.get_event_loop()
    c = Crawler(
        root_url, out_file=out_file, out_format=out_format,
        maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs,
        image_root_urls=image_root_urls, use_lastmodified=use_lastmodified,
        verifyssl=verifyssl, findimages=findimages,
        images_this_domain=images_this_domain, headers=headers,
        timezone_offset=timezone_offset, changefreq=changefreq,
        priorities=priorities)

    # The handler must be installed BEFORE the loop runs; registering it after
    # run_until_complete() (as before) could never interrupt the crawl.
    # With it in place, Ctrl-C stops the loop and run_until_complete() raises
    # RuntimeError because the crawl future has not completed.
    try:
        loop.add_signal_handler(signal.SIGINT, loop.stop)
    except RuntimeError:
        # Signal handlers are unavailable here (e.g. unsupported platform or
        # not the main thread; NotImplementedError is a RuntimeError subclass).
        sigint_installed = False
    else:
        sigint_installed = True

    try:
        loop.run_until_complete(c.run())
    finally:
        # Restore default Ctrl-C handling: a handler left on an idle loop
        # would otherwise swallow SIGINT for the rest of the process.
        if sigint_installed:
            loop.remove_signal_handler(signal.SIGINT)

    # Sum the first element of every entry in c.done (presumably the per-URL
    # success flag - confirm against Crawler). A generator keeps this safe
    # when nothing was crawled, where the old zip(...)[0] raised IndexError.
    ok_count = sum(result[0] for result in c.done.values())

    print('todo_queue:', len(c.todo_queue))
    print('busy:', len(c.busy))
    print('done:', len(c.done), '; ok:', ok_count)
    print('tasks:', len(c.tasks))