Sitemap generator
import logging
import asyncio
import re
import urllib.parse
from pysitemap.format_processors.xml import XMLWriter
from pysitemap.format_processors.text import TextWriter
import aiohttp


class Crawler:

    format_processors = {
        'xml': XMLWriter,
        'txt': TextWriter
    }

    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[],
                 image_root_urls=[], verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
                 todo_queue_backend=set, done_backend=dict, done_images=list):
        """
        Crawler constructor
        :param rooturl: root url of the site
        :type rooturl: str
        :param out_file: file to save the sitemap result to
        :type out_file: str
        :param out_format: sitemap type [xml | txt]. Default xml
        :type out_format: str
        :param maxtasks: maximum count of concurrent tasks. Default 10
        :type maxtasks: int
        :param exclude_urls: excludable url paths relative to the root url
        :type exclude_urls: list
        :param exclude_imgs: excludable img url paths relative to the root url
        :type exclude_imgs: list
        :param image_root_urls: recognized image root urls on the domain
        :type image_root_urls: list
        :param verifyssl: verify website certificate?
        :type verifyssl: boolean
        :param timezone_offset: timezone offset for lastmod tags
        :type timezone_offset: int
        :param changefreq: dictionary where each key is a site sub-url regex and the value is its changefreq
        :type changefreq: dict
        :param priorities: dictionary where each key is a site sub-url regex and the value is its priority float
        :type priorities: dict
        """
        self.rooturl = rooturl
        self.exclude_urls = exclude_urls
        self.exclude_imgs = exclude_imgs
        self.image_root_urls = image_root_urls
        self.todo_queue = todo_queue_backend()
        self.busy = set()
        self.done = done_backend()
        self.done_images = done_images()
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)
        self.timezone_offset = timezone_offset
        self.changefreq = changefreq
        self.priorities = priorities

        # connector stores cookies between requests and uses connection pool
        self.session = aiohttp.ClientSession(
            headers=headers,
            connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
        )
        self.writer = self.format_processors.get(out_format)(out_file)
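
    # Illustrative (assumed) shapes for the changefreq / priorities arguments
    # described in the docstring above; keys are sub-url regexes, values are
    # the sitemap changefreq string or priority float applied to matching urls:
    #   changefreq = {"/blog/": "weekly", "/": "monthly"}
    #   priorities = {"/blog/": 0.7, "/": 0.5}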

    async def run(self):
        """
        Main function to start parsing the site
        :return:
        """
        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
        await asyncio.sleep(1)
        while self.busy:
            await asyncio.sleep(1)
        await t
        await self.session.close()
        await self.writer.write(
            [(key, value) for key, value in self.done.items() if key and value],
            self.timezone_offset
        )

    async def contains(self, url, regex, rlist=True):
        """
        Does the url path match a value in the regex list?
        """
        retvalue = False
        if rlist:
            for exc in regex:
                retvalue = bool(re.search(re.compile(r"{}".format(exc)), url))
                if retvalue:
                    return retvalue
        else:
            retvalue = bool(re.search(re.compile(r"{}".format(regex)), url))
        return retvalue

    async def urldict(self, url, url_dict):
        """
        Parse URL regex (key) and value pairs
        """
        for urlkey, regvalue in url_dict.items():
            if await self.contains(url, urlkey, rlist=False):
                return regvalue
        return None

    async def addurls(self, urls):
        """
        Add urls to the queue and spawn tasks to process them
        :param urls:
        :return:
        """
        for url, parenturl in urls:
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)
            if (url.startswith(self.rooturl) and
                    not await self.contains(url, self.exclude_urls, rlist=True) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo_queue):
                self.todo_queue.add(url)
                # Acquire semaphore
                await self.sem.acquire()
                # Create async task
                task = asyncio.ensure_future(self.process(url))
                # Add callback into task to release semaphore
                task.add_done_callback(lambda t: self.sem.release())
                # Callback to remove task from tasks
                task.add_done_callback(self.tasks.remove)
                # Add task into tasks
                self.tasks.add(task)

    async def mimechecker(self, url, expected):
        """
        Check url resource mimetype
        """
        self.todo_queue.remove(url)
        self.busy.add(url)

        try:
            resp = await self.session.get(url)
        except Exception as exc:
            pass
        else:
            mime = resp.headers.get('content-type')
            if (resp.status == 200 and
                    bool(re.search(re.compile(r'{}'.format(expected)), mime))):
                resp.close()
                self.busy.remove(url)
                return True
            resp.close()
        self.busy.remove(url)
        return False

    async def fetchtags(self, data, url, tag_input, fields=[]):
        """
        Find and sort all target tags from website data
        """
        tags = []
        lines_join = []
        for line in data.split('\n'):
            lines_join.append(line)

        tags_raw = re.findall(re.compile(r'<{}.*?>'.format(tag_input)), ' '.join(lines_join))

        for tag_raw in tags_raw:
            tag_raw = re.sub(re.compile(r'<{}(.*?)>'.format(tag_input)), '\\1', tag_raw)

            # Regex lookahead + lookbehind:
            # find patterns that start with "<word>=" and end with " <word>=",
            # including the first pattern, which will be used to determine
            # the value it holds.
            # TODO Note: this method is error-prone, since it assumes that
            # no argument value inside an <img ... /> tag contains "<somechar>=".
            # If this happens, the args regex findall & splitting (below) fails.
            args_raw = re.findall(r'(?i)(?=[\w]+[=]|[\w\"\'])(.*?)(?=\s[\w]+[=])', tag_raw)

            tag = []
            for arg_raw in args_raw:
                arg = arg_raw.split('=')
                if len(arg) != 2:
                    print("warning: failure on tag data parsing operation.")
                    continue

                arg_dict = {}
                key = arg[0]
                # Remove leading and trailing quote marks from value
                value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
                value = re.sub(r'&', '&amp;', value)

                for field in fields:
                    if key == field:
                        arg_dict[field] = value
                #    else:
                #        print("warning: ignoring tag data value:", key)

                if len(arg_dict) == 1:
                    tag.append(arg_dict)
            tags.append(tag)
        return tags

    async def addtagdata(self, tagdata, url, source_url_field,
                         mimetype, tag_root_urls=[], excludes=[],
                         done_list=[], this_domain=True):
        """
        Validate existence of url in the given tagdata
        :return: list of validated tags (of a single type)
        """
        tags = []
        for data in tagdata:
            for tag in data:
                if source_url_field not in tag:
                    continue

                if not await self.contains(tag[source_url_field], excludes, rlist=True):
                    if this_domain:
                        if not tag[source_url_field].startswith('http'):
                            for tag_root_url in tag_root_urls:
                                if url.startswith(tag_root_url):
                                    tag[source_url_field] = tag_root_url + tag[source_url_field]
                                    break
                    else:
                        if not tag[source_url_field].startswith('http'):
                            continue

                    if (tag[source_url_field].startswith('http') and
                            data not in done_list and
                            tag[source_url_field] not in self.busy and
                            tag[source_url_field] not in self.todo_queue):
                        self.todo_queue.add(tag[source_url_field])
                        # Acquire semaphore
                        await self.sem.acquire()
                        # Create async task
                        task = asyncio.ensure_future(self.mimechecker(tag[source_url_field], mimetype))
                        # Add callback into task to release semaphore
                        task.add_done_callback(lambda t: self.sem.release())
                        # Callback to remove task from tasks
                        task.add_done_callback(self.tasks.remove)
                        # Add task into tasks
                        self.tasks.add(task)
                        try:
                            result = await asyncio.wait_for(task, timeout=20)
                            if result:
                                tags.append(data)
                        except asyncio.TimeoutError:
                            print("couldn't add tag data:", tag[source_url_field])
                            task.cancel()

        done_list.extend(tags)
        return tags

    async def process(self, url):
        """
        Process a single url
        :param url:
        :return:
        """
        print('processing:', url)

        # remove url from basic queue and add it into busy list
        self.todo_queue.remove(url)
        self.busy.add(url)

        lastmod = None
        cf = None
        pr = None
        imgs = []

        try:
            resp = await self.session.get(url)  # await response
        except Exception as exc:
            # on any exception mark url as BAD
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = [False, lastmod, cf, pr, imgs]
        else:
            # only urls with status == 200 and content type 'text/html' are parsed
            if (resp.status == 200 and
                    ('text/html' in resp.headers.get('content-type'))):
                data = (await resp.read()).decode('utf-8', 'replace')
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                lastmod = resp.headers.get('last-modified')

                # Ref: https://support.google.com/webmasters/answer/178636?hl=en
                img_data = await self.fetchtags(
                    data, url, 'img',
                    fields=['src', 'title', 'caption', 'geo_location', 'license']
                )
                imgs = await self.addtagdata(
                    tagdata=img_data, url=url,
                    source_url_field='src', mimetype=r'^image/',
                    tag_root_urls=self.image_root_urls,
                    excludes=self.exclude_imgs,
                    done_list=self.done_images,
                    this_domain=True
                )

                asyncio.ensure_future(self.addurls([(u, url) for u in urls]))

                # changefreq and priorities may be None, hence the AttributeError guard
                try:
                    cf = await self.urldict(url, self.changefreq)
                except (AttributeError, IndexError):
                    pass

                try:
                    pr = await self.urldict(url, self.priorities)
                except (AttributeError, IndexError):
                    pass

            # even if we have no exception, we can mark url as good
            resp.close()
            self.done[url] = [True, lastmod, cf, pr, imgs]

        self.busy.remove(url)
        logging.info('%d completed tasks, %d still pending, todo_queue %d',
                     len(self.done), len(self.tasks), len(self.todo_queue))
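

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; the root url and output file below
# are assumptions, not values from this module). A small runner script would
# normally construct the Crawler and drive the asyncio event loop like this:
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    crawler = Crawler(
        rooturl='https://www.example.com/',  # assumed site root
        out_file='sitemap.xml',              # assumed output path
        out_format='xml',
        maxtasks=10,
    )
    asyncio.get_event_loop().run_until_complete(crawler.run())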