|
|
- import asyncio
- import signal
- from pysitemap.base_crawler import Crawler
-
-
def crawler(
        root_url, out_file, out_format='xml',
        maxtasks=10, exclude_urls=None, exclude_imgs=None, verifyssl=True,
        headers=None, timezone_offset=0, changefreq=None,
        priorities=None):
    """
    Run the sitemap crawler against *root_url* and write the result file.

    Blocks until the crawl completes, then prints crawl statistics.

    :param root_url: site root url to start crawling from
    :param out_file: path to the output file
    :param out_format: format of the output file [xml, txt]
    :param maxtasks: max count of concurrent tasks
    :param exclude_urls: excludable url paths (defaults to no exclusions)
    :param exclude_imgs: excludable img url paths (defaults to no exclusions)
    :param verifyssl: verify website certificate?
    :param headers: send these headers in every request
    :param timezone_offset: timezone offset for lastmod tags
    :param changefreq: dictionary, where key is site sub url regex, and value is changefreq
    :param priorities: dictionary, where key is site sub url regex, and value is priority float
    :return: None
    """
    # Avoid mutable default arguments: a shared list default would leak
    # exclusions between successive calls.
    exclude_urls = [] if exclude_urls is None else exclude_urls
    exclude_imgs = [] if exclude_imgs is None else exclude_imgs

    loop = asyncio.get_event_loop()

    # Install the SIGINT handler BEFORE the crawl runs, so Ctrl-C can stop
    # the loop. (Previously this was registered after run_until_complete()
    # returned, when the crawl had already finished and the handler could
    # never fire.) Some platforms/loops don't support signal handlers
    # (e.g. Windows -> NotImplementedError); skip silently there.
    try:
        loop.add_signal_handler(signal.SIGINT, loop.stop)
    except (NotImplementedError, RuntimeError):
        pass

    c = Crawler(root_url, out_file=out_file, out_format=out_format,
                maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, verifyssl=verifyssl,
                headers=headers, timezone_offset=timezone_offset,
                changefreq=changefreq, priorities=priorities)

    loop.run_until_complete(c.run())

    print('todo_queue:', len(c.todo_queue))
    print('busy:', len(c.busy))
    # Each done-entry is assumed to start with an ok flag; summing via a
    # generator also handles an empty crawl (the old zip(*...)[0] form
    # raised IndexError when c.done was empty).
    print('done:', len(c.done), '; ok:', sum(ok for ok, *_ in c.done.values()))
    print('tasks:', len(c.tasks))
|