Sitemap generator

import logging
import asyncio
import re
import urllib.parse

import aiohttp

from pysitemap.format_processors.xml import XMLWriter
from pysitemap.format_processors.text import TextWriter


class Crawler:

    format_processors = {
        'xml': XMLWriter,
        'txt': TextWriter
    }

    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10,
                 exclude_urls=None, exclude_imgs=None, image_root_urls=None,
                 verifyssl=True, headers=None, timezone_offset=0,
                 changefreq=None, priorities=None,
                 todo_queue_backend=set, done_backend=dict, done_images=list):
  16. """
  17. Crawler constructor
  18. :param rooturl: root url of site
  19. :type rooturl: str
  20. :param out_file: file to save sitemap result
  21. :type out_file: str
  22. :param out_format: sitemap type [xml | txt]. Default xml
  23. :type out_format: str
  24. :param maxtasks: maximum count of tasks. Default 10
  25. :type maxtasks: int
  26. :param exclude_urls: excludable url paths relative to root url
  27. :type exclude_urls: list
  28. :param exclude_imgs: excludable img url paths relative to root url
  29. :type exclude_imgs: list
  30. :param image_root_urls: recognized image root urls on the domain
  31. :type image_root_urls: list
  32. :param verifyssl: verify website certificate?
  33. :type verifyssl: boolean
  34. :param timezone_offset: timezone offset for lastmod tags
  35. :type timezone_offset: int
  36. :param changefreq: dictionary, where key is site sub url regex, and value is changefreq
  37. :type changefreq: dict
  38. :param priorities: dictionary, where key is site sub url regex, and value is priority float
  39. :type priorities: dict
  40. """
        self.rooturl = rooturl
        # normalize optional list arguments so that omitted values become
        # empty lists instead of the bare `list` type
        self.exclude_urls = exclude_urls or []
        self.exclude_imgs = exclude_imgs or []
        self.image_root_urls = image_root_urls or []
        self.todo_queue = todo_queue_backend()
        self.busy = set()
        self.done = done_backend()
        self.done_images = done_images()
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)
        self.timezone_offset = timezone_offset
        self.changefreq = changefreq
        self.priorities = priorities
        # connector stores cookies between requests and uses connection pool
        self.session = aiohttp.ClientSession(
            headers=headers,
            connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
        )
        self.writer = self.format_processors.get(out_format)(out_file)

    async def run(self):
        """
        Main function to start parsing the site
        :return:
        """
        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
        await asyncio.sleep(1)
        while self.busy:
            await asyncio.sleep(1)

        await t
        await self.session.close()
        await self.writer.write([(key, value) for key, value in self.done.items()
                                 if key and value], self.timezone_offset)

    async def contains(self, url, regex, rlist=True):
        """
        Does the url path match any value in the regex list?
        """
        retvalue = False
        if rlist:
            for exc in regex:
                retvalue = bool(re.search(re.compile(r"{}".format(exc)), url))
                if retvalue:
                    return retvalue
        else:
            retvalue = bool(re.search(re.compile(r"{}".format(regex)), url))
        return retvalue

    async def urldict(self, url, url_dict):
        """
        Parse URL regex (key) and value pairs
        """
        # guard against a missing dictionary (changefreq/priorities default to None)
        if not url_dict:
            return None
        for urlkey, regvalue in url_dict.items():
            if await self.contains(url, urlkey, rlist=False):
                return regvalue
        return None
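
    # Hypothetical usage of the two helpers above (urls and patterns are
    # illustrative only):
    #   await self.contains("https://example.com/tag/foo", [r"\/tag\/"], rlist=True)    -> True
    #   await self.urldict("https://example.com/blog/post", {r"\/blog\/": "weekly"})    -> "weekly"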

    async def addurls(self, urls):
        """
        Add urls to the queue and start tasks to process them
        :param urls: list of (url, parenturl) tuples
        :return:
        """
        for url, parenturl in urls:
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)
            if (url.startswith(self.rooturl) and
                    not await self.contains(url, self.exclude_urls, rlist=True) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo_queue):
                self.todo_queue.add(url)
                # Acquire semaphore
                await self.sem.acquire()
                # Create async task
                task = asyncio.ensure_future(self.process(url))
                # Add callback to task to release semaphore
                task.add_done_callback(lambda t: self.sem.release())
                # Callback to remove task from tasks
                task.add_done_callback(self.tasks.remove)
                # Add task to tasks
                self.tasks.add(task)

    async def mimechecker(self, url, expected):
        """
        Check that the mimetype of the url resource matches the expected regex
        """
        self.todo_queue.remove(url)
        self.busy.add(url)
        try:
            resp = await self.session.get(url)
        except Exception:
            # on any exception the resource cannot be checked; make sure the
            # url still leaves the busy set so run() can finish
            self.busy.remove(url)
            return False

        mime = resp.headers.get('content-type') or ''
        result = (resp.status == 200 and
                  bool(re.search(re.compile(r"{}".format(expected)), mime)))
        resp.close()
        self.busy.remove(url)
        return result
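
    # Hypothetical example: for a resource served with "content-type: image/png",
    #   await self.mimechecker("https://example.com/logo.png", r"^image/")  -> True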

    async def addimages(self, data, url):
        """
        Find all images in website data
        """
        imgs = []
        imgs_ok = []
        lines_tmp = []
        tag = False
        # collect lines belonging to <img ...> tags, including tags that
        # span multiple lines
        for line in data.split('\n'):
            if re.search(r'<img', line):
                tag = True
            if re.search(r'<img', line) and re.search(r'\/>', line):
                # tag opens and closes on the same line
                tag = False
                lines_tmp.append(line)
                continue
            if re.search(r'\/>', line) and tag:
                # closing line of a multi-line tag
                tag = False
                lines_tmp.append(line)
                continue
            if tag:
                lines_tmp.append(line)

        # extract src attributes and filter out excluded images
        imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', '\n'.join(lines_tmp))
        for img in imgs:
            image_url = ""
            if not await self.contains(img, self.exclude_imgs, rlist=True):
                if img.startswith(self.rooturl):
                    image_url = img
                elif not img.startswith("http"):
                    # resolve relative image paths against a known image root url
                    for image_root_url in self.image_root_urls:
                        if url.startswith(image_root_url):
                            image_url = image_root_url + img
                            break
                if (image_url != "" and
                        image_url not in self.done_images and
                        image_url not in self.busy and
                        image_url not in self.todo_queue):
                    self.todo_queue.add(image_url)
                    # Acquire semaphore
                    await self.sem.acquire()
                    # Create async task
                    task = asyncio.ensure_future(self.mimechecker(image_url, r'^image/'))
                    # Add callback to task to release semaphore
                    task.add_done_callback(lambda t: self.sem.release())
                    # Callback to remove task from tasks
                    task.add_done_callback(self.tasks.remove)
                    # Add task to tasks
                    self.tasks.add(task)
                    try:
                        result = await asyncio.wait_for(task, timeout=20)
                        if result:
                            imgs_ok.append(image_url)
                    except asyncio.TimeoutError:
                        print("couldn't add image:", image_url)
                        task.cancel()

        self.done_images.extend(imgs_ok)
        return imgs_ok

    async def process(self, url):
        """
        Process single url
        :param url:
        :return:
        """
        print('processing:', url)

        # remove url from basic queue and add it into busy list
        self.todo_queue.remove(url)
        self.busy.add(url)

        lastmod = None
        cf = None
        pr = None
        imgs = []

        try:
            resp = await self.session.get(url)  # await response
        except Exception as exc:
            # on any exception mark url as BAD
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = [False, lastmod, cf, pr, imgs]
        else:
            # only urls with status == 200 and content type 'text/html' are parsed
            if (resp.status == 200 and
                    'text/html' in (resp.headers.get('content-type') or '')):
                data = (await resp.read()).decode('utf-8', 'replace')
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                lastmod = resp.headers.get('last-modified')
                imgs = await self.addimages(data, url)
                asyncio.ensure_future(self.addurls([(u, url) for u in urls]))

                cf = await self.urldict(url, self.changefreq)
                pr = await self.urldict(url, self.priorities)

            # even if we have no exception, we can mark url as good
            resp.close()
            self.done[url] = [True, lastmod, cf, pr, imgs]

        self.busy.remove(url)
        logging.info('%d completed tasks, %d still pending, todo_queue %d',
                     len(self.done), len(self.tasks), len(self.todo_queue))
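

# A minimal usage sketch, assuming this module is importable as-is and the
# target site is reachable; the url, output file, and regex rules below are
# illustrative only, not part of the library.
if __name__ == "__main__":
    crawler = Crawler(
        rooturl='https://www.example.com/',    # hypothetical site root
        out_file='sitemap.xml',                # hypothetical output file
        changefreq={r"\/blog\/": "weekly"},    # hypothetical changefreq rule
        priorities={r"\/blog\/": 0.5},         # hypothetical priority rule
    )
    loop = asyncio.get_event_loop()
    loop.run_until_complete(crawler.run())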