Sitemap generator
import logging
import asyncio
import re
import urllib.parse
from pysitemap.format_processors.xml import XMLWriter
from pysitemap.format_processors.text import TextWriter
import aiohttp


class Crawler:

    format_processors = {
        'xml': XMLWriter,
        'txt': TextWriter
    }
    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[],
                 image_root_urls=[], use_lastmodified=True, verifyssl=True, findimages=True, images_this_domain=True,
                 headers=None, timezone_offset=0, changefreq=None, priorities=None, todo_queue_backend=set,
                 done_backend=dict, done_images=list):
        """
        Crawler constructor
        :param rooturl: root url of the site
        :type rooturl: str
        :param out_file: file to save the sitemap result to
        :type out_file: str
        :param out_format: sitemap type [xml | txt]. Default: xml
        :type out_format: str
        :param maxtasks: maximum number of concurrent tasks. Default: 10
        :type maxtasks: int
        :param exclude_urls: url paths to exclude, relative to the root url
        :type exclude_urls: list
        :param exclude_imgs: image url paths to exclude, relative to the root url
        :type exclude_imgs: list
        :param image_root_urls: recognized image root urls on the domain
        :type image_root_urls: list
        :param use_lastmodified: enable or disable lastmod timestamps for fetched urls
        :type use_lastmodified: bool
        :param verifyssl: verify the website's SSL certificate?
        :type verifyssl: bool
        :param findimages: find image references?
        :type findimages: bool
        :param images_this_domain: find only images that refer to this domain?
        :type images_this_domain: bool
        :param timezone_offset: timezone offset for lastmod tags
        :type timezone_offset: int
        :param changefreq: dictionary where the key is a site sub url regex and the value is a changefreq string
        :type changefreq: dict
        :param priorities: dictionary where the key is a site sub url regex and the value is a priority float
        :type priorities: dict
        """
        self.rooturl = rooturl
        self.exclude_urls = exclude_urls
        self.exclude_imgs = exclude_imgs
        self.use_lastmodified = use_lastmodified
        self.image_root_urls = image_root_urls
        self.findimages = findimages
        self.images_this_domain = images_this_domain
        self.todo_queue = todo_queue_backend()
        self.busy = set()
        self.done = done_backend()
        self.done_images = done_images()
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)
        self.timezone_offset = timezone_offset
        self.changefreq = changefreq
        self.priorities = priorities

        # connector stores cookies between requests and uses connection pool
        self.session = aiohttp.ClientSession(
            headers=headers,
            connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
        )

        self.writer = self.format_processors.get(out_format)(out_file)
    async def run(self):
        """
        Main function to start parsing the site
        :return:
        """
        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
        await asyncio.sleep(1)
        while self.busy:
            await asyncio.sleep(1)

        await t
        await self.session.close()
        await self.writer.write([(key, value) for key, value in self.done.items() if key and value],
                                self.timezone_offset)
    async def contains(self, url, regex, rlist=True):
        """
        Does the url path match a value in the regex list?
        """
        retvalue = False
        if rlist:
            for exc in regex:
                retvalue = bool(re.search(re.compile(r"{}".format(exc)), url))
                if retvalue:
                    return retvalue
        else:
            retvalue = bool(re.search(re.compile(r"{}".format(regex)), url))
        return retvalue

    async def urldict(self, url, url_dict):
        """
        Parse URL regex (key) and value pairs
        """
        for urlkey, regvalue in url_dict.items():
            if await self.contains(url, urlkey, rlist=False):
                return regvalue
        return None
    async def addurls(self, urls):
        """
        Add urls to the queue and spawn tasks to process them
        :param urls:
        :return:
        """
        for url, parenturl in urls:
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)
            if (url.startswith(self.rooturl) and
                    not await self.contains(url, self.exclude_urls, rlist=True) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo_queue):
                self.todo_queue.add(url)
                # Acquire semaphore
                await self.sem.acquire()
                # Create async task
                task = asyncio.ensure_future(self.process(url))
                # Add callback into task to release semaphore
                task.add_done_callback(lambda t: self.sem.release())
                # Callback to remove task from tasks
                task.add_done_callback(self.tasks.remove)
                # Add task into tasks
                self.tasks.add(task)
    async def mimechecker(self, url, expected):
        """
        Check that the url resource has the expected mimetype
        """
        self.todo_queue.remove(url)
        self.busy.add(url)
        try:
            resp = await self.session.get(url)
        except Exception:
            # request failed, so the resource cannot be validated
            self.busy.remove(url)
            return False

        mime = resp.headers.get('content-type', '')
        if (resp.status == 200 and
                bool(re.search(re.compile(r'{}'.format(expected)), mime))):
            resp.close()
            self.busy.remove(url)
            return True
        resp.close()
        self.busy.remove(url)
        return False
    async def fetchtags(self, data, url, tag_input, fields=[]):
        """
        Find all target tags in the website data and extract the requested fields
        """
        tags = []
        joined_data = ' '.join(data.split('\n'))
        tags_raw = re.findall(re.compile(r'<{}.*?>'.format(tag_input)), joined_data)
        for tag_raw in tags_raw:
            tag_raw = re.sub(re.compile(r'<{}(.*?)>'.format(tag_input)), '\\1', tag_raw)

            # Regex lookahead + lookbehind
            # Find patterns, where a pattern starts with "<word>=" and ends with " <word>="
            # Include the first pattern, which will be used to determine
            # the value which the pattern holds in it
            # TODO Note: this method is error-prone, since it assumes that
            # no argument value inside an <img ... /> tag contains "<somechar>="
            # If this happens, the args regex findall & splitting (below) fails.
            args_raw = re.findall(r'(?i)(?=[\w]+[=]|[\w\"\'])(.*?)(?=\s[\w]+[=])', tag_raw)
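            # Illustrative example (assumed input, not from the original code):
            # for tag_raw = ' src="/images/a.png" title="Photo" alt="A photo"',
            # args_raw becomes ['src="/images/a.png"', 'title="Photo"']; each item
            # is a key=value chunk. Note that the last attribute has no trailing
            # ' <word>=' anchor and is therefore not captured by this pattern.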
            tag = []
            for arg_raw in args_raw:
                arg = arg_raw.split('=')
                if len(arg) != 2:
                    print("warning: failure on tag data parsing operation.")
                    continue
                arg_dict = {}
                key = arg[0]
                # Remove leading and trailing quote marks from value
                value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
                value = re.sub(r'&', '&amp;', value)

                for field in fields:
                    if key == field:
                        arg_dict[field] = value
                    # else:
                    #     print("warning: ignoring tag data value:", key)
                if len(arg_dict) == 1:
                    tags.append(arg_dict)
        return tags
    async def addtagdata(self, tagdata, url, source_url_field,
                         mimetype, tag_root_urls=[], excludes=[],
                         done_list=[], this_domain=True):
        """
        Validate existence of the url in the given tagdata
        :return: list of validated tags (of a single type)
        """
        tags = []
        for tag in tagdata:
            if source_url_field not in tag:
                continue

            if not await self.contains(tag[source_url_field], excludes, rlist=True):
                if this_domain:
                    if not tag[source_url_field].startswith('http'):
                        for tag_root_url in tag_root_urls:
                            if url.startswith(tag_root_url):
                                tag[source_url_field] = tag_root_url + tag[source_url_field]
                                break
                else:
                    if not tag[source_url_field].startswith('http'):
                        continue

                if (tag[source_url_field].startswith('http') and
                        tag not in done_list and
                        tag[source_url_field] not in self.busy and
                        tag[source_url_field] not in self.todo_queue):
                    self.todo_queue.add(tag[source_url_field])
                    # Acquire semaphore
                    await self.sem.acquire()
                    # Create async task
                    task = asyncio.ensure_future(self.mimechecker(tag[source_url_field], mimetype))
                    # Add callback into task to release semaphore
                    task.add_done_callback(lambda t: self.sem.release())
                    # Callback to remove task from tasks
                    task.add_done_callback(self.tasks.remove)
                    # Add task into tasks
                    self.tasks.add(task)
                    try:
                        result = await asyncio.wait_for(task, timeout=20)
                        if result:
                            tags.append(tag)
                    except asyncio.TimeoutError:
                        print("couldn't add tag data:", tag[source_url_field])
                        task.cancel()

        done_list.extend(tags)
        return tags
    async def process(self, url):
        """
        Process a single url
        :param url:
        :return:
        """
        print('processing:', url)

        # remove url from the todo queue and add it to the busy list
        self.todo_queue.remove(url)
        self.busy.add(url)

        lastmod = None
        cf = None
        pr = None
        imgs = []

        try:
            resp = await self.session.get(url)  # await response
        except Exception as exc:
            # on any exception mark url as BAD
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = [False, lastmod, cf, pr, imgs]
        else:
            # only urls with status == 200 and content type 'text/html' are parsed
            if (resp.status == 200 and
                    'text/html' in resp.headers.get('content-type', '')):
                data = (await resp.read()).decode('utf-8', 'replace')
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)

                if self.use_lastmodified:
                    lastmod = resp.headers.get('last-modified')

                if self.findimages:
                    # Ref: https://support.google.com/webmasters/answer/178636?hl=en
                    img_data = await self.fetchtags(
                        data, url, 'img',
                        fields=['src', 'title', 'caption', 'geo_location', 'license']
                    )
                    imgs = await self.addtagdata(
                        tagdata=img_data, url=url,
                        source_url_field='src', mimetype=r'^image/',
                        tag_root_urls=self.image_root_urls,
                        excludes=self.exclude_imgs,
                        done_list=self.done_images,
                        this_domain=self.images_this_domain
                    )

                asyncio.ensure_future(self.addurls([(u, url) for u in urls]))

                try:
                    cf = await self.urldict(url, self.changefreq)
                except AttributeError:
                    # changefreq was not given (None)
                    pass
                try:
                    pr = await self.urldict(url, self.priorities)
                except AttributeError:
                    # priorities was not given (None)
                    pass

            # even if we have no exception, we can mark url as good
            resp.close()
            self.done[url] = [True, lastmod, cf, pr, imgs]

        self.busy.remove(url)
        logging.info('%d completed tasks, %d still pending, todo_queue %d',
                     len(self.done), len(self.tasks), len(self.todo_queue))
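
# Usage sketch (illustrative only, not part of the module). Assuming this file
# is importable as pysitemap.base_crawler, a sitemap could be generated like so:
#
#   import asyncio
#   from pysitemap.base_crawler import Crawler
#
#   async def main():
#       crawler = Crawler('https://example.com/', out_file='sitemap.xml',
#                         changefreq={r'/blog/': 'weekly'},
#                         priorities={r'/blog/': 0.7})
#       await crawler.run()
#
#   asyncio.run(main())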