Sitemap generator

import logging
import asyncio
import re
import urllib.parse

import aiohttp

from pysitemap.format_processors.xml import XMLWriter
from pysitemap.format_processors.text import TextWriter


class Crawler:

    format_processors = {
        'xml': XMLWriter,
        'txt': TextWriter
    }

    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100,
                 todo_queue_backend=set, done_backend=dict):
        """
        Crawler constructor
        :param rooturl: root url of the site to crawl
        :type rooturl: str
        :param out_file: file to save the sitemap result to
        :type out_file: str
        :param out_format: sitemap type [xml | txt]. Default: xml
        :type out_format: str
        :param maxtasks: maximum number of concurrent tasks. Default: 100
        :type maxtasks: int
        """
        self.rooturl = rooturl
        self.todo_queue = todo_queue_backend()
        self.busy = set()
        self.done = done_backend()
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)
        # the session stores cookies between requests and uses a connection pool
        self.session = aiohttp.ClientSession()
        self.writer = self.format_processors.get(out_format)(out_file)

    async def run(self):
        """
        Main entry point: start parsing the site
        :return:
        """
        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
        await asyncio.sleep(1)
        while self.busy:
            await asyncio.sleep(1)
        await t
        await self.session.close()
        await self.writer.write([key for key, value in self.done.items() if value])

    async def addurls(self, urls):
        """
        Add urls to the queue and schedule tasks to process them
        :param urls: iterable of (url, parent_url) tuples
        :return:
        """
        for url, parenturl in urls:
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)
            if (url.startswith(self.rooturl) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo_queue):
                self.todo_queue.add(url)
                # Acquire semaphore
                await self.sem.acquire()
                # Create async task
                task = asyncio.ensure_future(self.process(url))
                # Add callback to the task to release the semaphore
                task.add_done_callback(lambda t: self.sem.release())
                # Callback to remove task from tasks
                task.add_done_callback(self.tasks.remove)
                # Add task into tasks
                self.tasks.add(task)

    async def process(self, url):
        """
        Process single url
        :param url:
        :return:
        """
        print('processing:', url)

        # remove url from basic queue and add it into busy list
        self.todo_queue.remove(url)
        self.busy.add(url)

        try:
            resp = await self.session.get(url)  # await response
        except Exception as exc:
            # on any exception mark url as BAD
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = False
        else:
            # only urls with status == 200 and content type 'text/html' are parsed
            if (resp.status == 200 and
                    'text/html' in resp.headers.get('content-type', '')):
                data = (await resp.read()).decode('utf-8', 'replace')
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                asyncio.ensure_future(self.addurls([(u, url) for u in urls]))

            # even if we have no exception, we can mark url as good
            resp.close()
            self.done[url] = True

        self.busy.remove(url)
        logging.info('%d completed tasks, %d still pending, todo_queue %d',
                     len(self.done), len(self.tasks), len(self.todo_queue))
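
Below is a minimal usage sketch showing how the Crawler class above might be driven. The module path (crawler), the root URL, and the output filename are assumptions for illustration; adjust them to your project layout.

import asyncio

from crawler import Crawler  # assumed module name for the code above


def main():
    # Crawl an example site and collect reachable pages into sitemap.xml.
    # Note: aiohttp.ClientSession is created in Crawler.__init__, so depending
    # on your aiohttp version, constructing the crawler outside a running
    # event loop may emit a DeprecationWarning.
    loop = asyncio.get_event_loop()
    c = Crawler('https://www.example.com/', out_file='sitemap.xml',
                out_format='xml', maxtasks=50)
    loop.run_until_complete(c.run())


if __name__ == '__main__':
    main()

The semaphore created in the constructor caps the number of in-flight requests at maxtasks, so lowering that value trades crawl speed for politeness toward the target server.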