Sitemap generator

An asynchronous crawler that collects a site's internal links with aiohttp and parsel, as a starting point for building a sitemap.

#!/usr/bin/env python3
import asyncio

from aiohttp import ClientSession
from parsel import Selector
from urllib.parse import urlparse, urlunparse


class Crawler:
    def __init__(self, url, sleep_time=0.5):
        self.urls = [url]
        scheme, netloc, path, params, query, fragment = urlparse(url)
        # urlparse puts a bare domain ("example.com") into `path`,
        # so swap it into `netloc` before rebuilding the base URL.
        if not netloc:
            netloc, path = path, netloc
        url = urlunparse((scheme, netloc, "", params, "", fragment))
        self.base_url = url
        self.sleep_time = float(sleep_time)

    async def fetch(self, url, session):
        async with session.get(url) as response:
            # Throttle between requests to stay polite to the server.
            await asyncio.sleep(self.sleep_time)
            return await response.text()

    async def bound_fetch(self, sem, url, session):
        # Getter function with semaphore.
        async with sem:
            return await self.fetch(url, session)

    def norm_link(self, url: str):
        if url.startswith(self.base_url):
            return url
        elif url.startswith("//"):
            # Protocol-relative link: prepend the base URL's scheme,
            # including the colon (e.g. "https:" + "//example.com/...").
            return "{scheme}{url}".format(
                scheme=self.base_url[:self.base_url.find(":") + 1],
                url=url,
            )
        elif url.startswith("/"):
            return self.base_url + url
        return None

    async def parse(self, content):
        sel = Selector(text=content)
        links = sel.xpath('//a/@href').getall()
        normalized_links = []
        for link in links:
            link = self.norm_link(link)
            if link:
                normalized_links.append(link)
        self.urls.extend(normalized_links)

    async def run(self):
        tasks = []
        # Create an instance of Semaphore to cap concurrent requests.
        sem = asyncio.Semaphore(20)
        # Create a client session so we don't open a new connection
        # per request.
        async with ClientSession() as session:
            for url in self.urls:
                # Pass the semaphore and session to every GET request.
                task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
                tasks.append(task)
            responses = await asyncio.gather(*tasks)
            # Extract and record the links found on each fetched page.
            for content in responses:
                await self.parse(content)
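
To try the crawler end to end, one option (a sketch, not part of the original file) is to append an entry point like the following to the script. The seed URL https://example.com is a placeholder; substitute the site you want to map.

if __name__ == "__main__":
    # Crawl the seed URL, then print every link that was collected.
    crawler = Crawler("https://example.com")
    asyncio.run(crawler.run())
    for link in crawler.urls:
        print(link)

Note that crawler.urls may contain duplicates, since every discovered link is appended as-is; deduplicating (for example via a set) before emitting the sitemap is left to the caller.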