Sitemap generator
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

46 lines
1.9 KiB

4 years ago
4 years ago
  1. import asyncio
  2. import signal
  3. from pysitemap.base_crawler import Crawler
  4. def crawler(
  5. root_url, out_file, out_format='xml',
  6. maxtasks=10, exclude_urls=[], exclude_imgs=[], image_root_urls=[],
  7. verifyssl=True, findimages=True, images_this_domain=True, headers=None,
  8. timezone_offset=0, changefreq=None, priorities=None):
  9. """
  10. run crowler
  11. :param root_url: Site root url
  12. :param out_file: path to the out file
  13. :param out_format: format of out file [xml, txt]
  14. :param maxtasks: max count of tasks
  15. :param exclude_urls: excludable url paths
  16. :param exclude_imgs: excludable img url paths
  17. :param image_root_urls: recognized image root urls on the domain
  18. :param verifyssl: verify website certificate?
  19. :param findimages: Find images references?
  20. :param images_this_domain: Find images which refer to this domain only?
  21. :param headers: Send these headers in every request
  22. :param timezone_offset: timezone offset for lastmod tags
  23. :param changefreq: dictionary, where key is site sub url regex, and value is changefreq
  24. :param priorities: dictionary, where key is site sub url regex, and value is priority float
  25. :return:
  26. """
  27. loop = asyncio.get_event_loop()
  28. c = Crawler(root_url, out_file=out_file, out_format=out_format,
  29. maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs,
  30. image_root_urls=image_root_urls, verifyssl=verifyssl, findimages=findimages,
  31. images_this_domain=images_this_domain, headers=headers, timezone_offset=timezone_offset,
  32. changefreq=changefreq, priorities=priorities)
  33. loop.run_until_complete(c.run())
  34. try:
  35. loop.add_signal_handler(signal.SIGINT, loop.stop)
  36. except RuntimeError:
  37. pass
  38. print('todo_queue:', len(c.todo_queue))
  39. print('busy:', len(c.busy))
  40. print('done:', len(c.done), '; ok:', sum(list(zip(*c.done.values()))[0]) )
  41. print('tasks:', len(c.tasks))