Sitemap generator

import logging
import asyncio
import re
import urllib.parse

import aiohttp

from pysitemap.format_processors.xml import XMLWriter
from pysitemap.format_processors.text import TextWriter


class Crawler:

    format_processors = {
        'xml': XMLWriter,
        'txt': TextWriter
    }

    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[],
                 verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
                 todo_queue_backend=set, done_backend=dict):
        """
        Crawler constructor
        :param rooturl: root URL of the site
        :type rooturl: str
        :param out_file: file to save the sitemap result to
        :type out_file: str
        :param out_format: sitemap type [xml | txt]. Default xml
        :type out_format: str
        :param maxtasks: maximum number of concurrent tasks. Default 10
        :type maxtasks: int
        :param exclude_urls: excludable URL paths relative to the root URL
        :type exclude_urls: list
        :param exclude_imgs: excludable image URL paths relative to the root URL
        :type exclude_imgs: list
        :param verifyssl: verify the website certificate?
        :type verifyssl: boolean
        :param timezone_offset: timezone offset for lastmod tags
        :type timezone_offset: int
        :param changefreq: dictionary where the key is a site sub-URL regex and the value is the changefreq
        :type changefreq: dict
        :param priorities: dictionary where the key is a site sub-URL regex and the value is a priority float
        :type priorities: dict
        """
        self.rooturl = rooturl
        self.exclude_urls = exclude_urls
        self.exclude_imgs = exclude_imgs
        self.todo_queue = todo_queue_backend()
        self.busy = set()
        self.done = done_backend()
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)
        self.timezone_offset = timezone_offset
        self.changefreq = changefreq
        self.priorities = priorities

        # connector stores cookies between requests and uses connection pool
        self.session = aiohttp.ClientSession(
            headers=headers,
            connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
        )

        self.writer = self.format_processors.get(out_format)(out_file)

    async def run(self):
        """
        Main function to start parsing the site
        :return:
        """
        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
        await asyncio.sleep(1)

        # wait until every URL currently being processed has finished
        while self.busy:
            await asyncio.sleep(1)

        await t
        await self.session.close()
        await self.writer.write(
            [(key, value) for key, value in self.done.items() if key and value],
            self.timezone_offset
        )

    async def contains(self, url, regex, rlist=True):
        """
        Does the url path match a value in the regex list?
        """
        retvalue = False
        if rlist:
            # regex is a list of patterns; return True on the first match
            for exc in regex:
                retvalue = bool(re.search(re.compile(r"{}".format(exc)), url))
                if retvalue:
                    return retvalue
        else:
            # regex is a single pattern
            retvalue = bool(re.search(re.compile(r"{}".format(regex)), url))
        return retvalue

    async def urldict(self, url, url_dict):
        """
        Parse URL regex (key) and value pairs
        """
        for urlkey, regvalue in url_dict.items():
            if await self.contains(url, urlkey, rlist=False):
                return regvalue
        return None

    async def addurls(self, urls):
        """
        Add urls to the queue and start tasks to process them
        :param urls:
        :return:
        """
        for url, parenturl in urls:
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)
            if (url.startswith(self.rooturl) and
                    not await self.contains(url, self.exclude_urls, rlist=True) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo_queue):
                self.todo_queue.add(url)
                # Acquire semaphore
                await self.sem.acquire()
                # Create async task
                task = asyncio.ensure_future(self.process(url))
                # Add callback into task to release semaphore
                task.add_done_callback(lambda t: self.sem.release())
                # Callback to remove task from tasks
                task.add_done_callback(self.tasks.remove)
                # Add task into tasks
                self.tasks.add(task)

    async def mimechecker(self, url, expected):
        """
        Check that the mimetype of the url resource matches the expected regex
        """
        try:
            resp = await self.session.get(url)
        except Exception:
            pass
        else:
            # default to an empty string so a missing content-type header
            # does not break the regex search
            mime = resp.headers.get('content-type', '')
            resp.close()
            if (resp.status == 200 and
                    bool(re.search(re.compile(r"{}".format(expected)), mime))):
                return True
        return False

    async def addimages(self, data):
        """
        Find all images in website data
        """
        imgs = []
        imgs_ok = []
        lines_tmp = []
        tag = False
        # collect all lines belonging to <img ...> tags, including tags
        # that span multiple lines
        for line in data.split('\n'):
            if re.search(r'<img', line):
                tag = True
            if re.search(r'<img', line) and re.search(r'\/>', line):
                tag = False
                lines_tmp.append(line)
                continue
            if re.search(r'\/>', line) and tag:
                tag = False
                lines_tmp.append(line)
                continue
            if tag:
                lines_tmp.append(line)

        imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
        for img in imgs:
            if not await self.contains(img, self.exclude_imgs, rlist=True):
                if img.startswith(self.rooturl):
                    if await self.mimechecker(img, r'^image/'):
                        imgs_ok.append(img)
                elif not img.startswith("http"):
                    if await self.mimechecker(img, r'^image/'):
                        imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
        return imgs_ok

    async def process(self, url):
        """
        Process a single url
        :param url:
        :return:
        """
        print('processing:', url)

        # remove url from the todo queue and add it to the busy set
        self.todo_queue.remove(url)
        self.busy.add(url)

        lastmod = None
        cf = None
        pr = None
        imgs = []

        try:
            resp = await self.session.get(url)  # await response
        except Exception as exc:
            # on any exception mark url as BAD
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = [False, lastmod, cf, pr, imgs]
        else:
            # only urls with status == 200 and content type 'text/html' are parsed
            if (resp.status == 200 and
                    'text/html' in resp.headers.get('content-type', '')):
                data = (await resp.read()).decode('utf-8', 'replace')
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                lastmod = resp.headers.get('last-modified')
                imgs = await self.addimages(data)
                asyncio.ensure_future(self.addurls([(u, url) for u in urls]))

                if self.changefreq:
                    cf = await self.urldict(url, self.changefreq)
                if self.priorities:
                    pr = await self.urldict(url, self.priorities)

            # even if we have no exception, we can mark url as good
            resp.close()
            self.done[url] = [True, lastmod, cf, pr, imgs]

        self.busy.remove(url)
        logging.info('%s completed tasks, %s still pending, todo_queue %s',
                     len(self.done), len(self.tasks), len(self.todo_queue))
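

# --- Usage sketch (not part of the original module) ---
# A minimal way to drive the crawler directly, under the assumption that it
# is run from a synchronous entry point; the pysitemap package also provides
# its own entry-point helpers. The URL and file name below are placeholders.
if __name__ == '__main__':
    import sys

    logging.basicConfig(level=logging.INFO)
    root = sys.argv[1] if len(sys.argv) > 1 else 'https://example.com/'
    crawler = Crawler(root, out_file='sitemap.xml', out_format='xml', maxtasks=10)
    # run_until_complete is used instead of asyncio.run() so the ClientSession
    # created in __init__ stays attached to the same event loop
    asyncio.get_event_loop().run_until_complete(crawler.run())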