Sitemap generator
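A Crawler class that walks a site from a root URL with aiohttp, keeps only pages under that root, and writes the successfully fetched URLs out as an XML or plain-text sitemap.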
import asyncio
import re
import urllib.parse

import aiohttp

from pysitemap.format_processors.xml import XMLWriter
from pysitemap.format_processors.text import TextWriter


class Crawler:

    # Maps an output format name to the writer class that produces it.
    format_processors = {
        'xml': XMLWriter,
        'txt': TextWriter,
    }

    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
        self.rooturl = rooturl
        self.todo = set()    # discovered URLs waiting to be scheduled
        self.busy = set()    # URLs currently being fetched
        self.done = {}       # URL -> True on success, False on failure
        self.tasks = set()   # in-flight asyncio tasks
        self.sem = asyncio.Semaphore(maxtasks)  # caps concurrent fetches
        # The session stores cookies between requests and uses a connection pool.
        self.session = aiohttp.ClientSession()
        self.writer = self.format_processors.get(out_format)(out_file)

    async def run(self):
        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
        await asyncio.sleep(1)

        # Poll until no fetch is in flight; running tasks keep `busy` populated.
        while self.busy:
            await asyncio.sleep(1)

        await t
        await self.session.close()
        # Emit only the URLs that were fetched successfully.
        await self.writer.write([key for key, value in self.done.items() if value])

    async def addurls(self, urls):
        for url, parenturl in urls:
            # Resolve the link against the page it came from and strip any
            # #fragment so equivalent URLs collapse to a single entry.
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)
            if (url.startswith(self.rooturl) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo):
                self.todo.add(url)
                # Blocks once `maxtasks` fetches are already running.
                await self.sem.acquire()
                task = asyncio.ensure_future(self.process(url))
                task.add_done_callback(lambda t: self.sem.release())
                task.add_done_callback(self.tasks.remove)
                self.tasks.add(task)

    async def process(self, url):
        print('processing:', url)
        self.todo.remove(url)
        self.busy.add(url)
        try:
            resp = await self.session.get(url)
        except Exception as exc:
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = False
        else:
            # Only parse successful HTML responses; the '' default guards
            # against responses that carry no content-type header.
            if (resp.status == 200 and
                    'text/html' in resp.headers.get('content-type', '')):
                data = (await resp.read()).decode('utf-8', 'replace')
                # Crude link extraction: pull href values straight out of the markup.
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                asyncio.ensure_future(self.addurls([(u, url) for u in urls]))
            resp.close()
            self.done[url] = True
        self.busy.remove(url)
        print(len(self.done), 'completed tasks,', len(self.tasks),
              'still pending, todo', len(self.todo))
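
A minimal driver for the class above, as a sketch: the import path, root URL, and output file name are placeholders, and the crawler is constructed inside a coroutine so that aiohttp.ClientSession binds to the running event loop.

    import asyncio

    from pysitemap import Crawler  # assumed import path; adjust to where the class lives

    async def main():
        # Placeholder arguments: crawl root and output file are illustrative only.
        crawler = Crawler('https://example.com/', 'sitemap.xml', out_format='xml')
        await crawler.run()

    if __name__ == '__main__':
        asyncio.run(main())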