Sitemap generator
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

157 lines
5.7 KiB

9 years ago
8 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
8 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
8 years ago
8 years ago
9 years ago
9 years ago
9 years ago
9 years ago
8 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
6 years ago
8 years ago
  1. # -*- coding: utf-8 -*-
  2. import __future__
  3. import sys
  4. if sys.version_info.major == 2:
  5. import urlparse
  6. else:
  7. from urllib import parse as urlparse
  8. import requests
  9. from lxml import html
  10. import re
  11. import time
  12. try:
  13. import sys
  14. if 'threading' in sys.modules:
  15. del sys.modules['threading']
  16. print('threading module loaded before patching!')
  17. print('threading module deleted from sys.modules!\n')
  18. from gevent import monkey, pool
  19. monkey.patch_all()
  20. gevent_installed = True
  21. except:
  22. print("Gevent does not installed. Parsing process will be slower.")
  23. gevent_installed = False
  24. class Crawler:
  25. def __init__(self, url, outputfile='sitemap.xml', logfile='error.log', oformat='xml'):
  26. self.url = url
  27. self.logfile = open(logfile, 'a')
  28. self.oformat = oformat
  29. self.outputfile = outputfile
  30. # create lists for the urls in que and visited urls
  31. self.urls = set([url])
  32. self.visited = set([url])
  33. self.exts = ['htm', 'php']
  34. self.allowed_regex = '\.((?!htm)(?!php)\w+)$'
  35. self.errors = {'404': []}
  36. def set_exts(self, exts):
  37. self.exts = exts
  38. def allow_regex(self, regex=None):
  39. if regex is not None:
  40. self.allowed_regex = regex
  41. else:
  42. allowed_regex = ''
  43. for ext in self.exts:
  44. allowed_regex += '(!{})'.format(ext)
  45. self.allowed_regex = '\.({}\w+)$'.format(allowed_regex)
  46. def crawl(self, echo=False, pool_size=1):
  47. # sys.stdout.write('echo attribute deprecated and will be removed in future')
  48. self.echo = echo
  49. self.regex = re.compile(self.allowed_regex)
  50. print('Parsing pages')
  51. if gevent_installed and pool_size >= 1:
  52. self.pool = pool.Pool(pool_size)
  53. self.pool.spawn(self.parse_gevent)
  54. self.pool.join()
  55. else:
  56. self.pool = [None,] # fixing n_poll exception in self.parse with poolsize > 1 and gevent_installed == False
  57. while len(self.urls) > 0:
  58. self.parse()
  59. if self.oformat == 'xml':
  60. self.write_xml()
  61. elif self.oformat == 'txt':
  62. self.write_txt()
  63. with open('errors.txt', 'w') as err_file:
  64. for key, val in self.errors.items():
  65. err_file.write(u'\n\nError {}\n\n'.format(key))
  66. err_file.write(u'\n'.join(set(val)))
  67. def parse_gevent(self):
  68. self.parse()
  69. while len(self.urls) > 0 and not self.pool.full():
  70. self.pool.spawn(self.parse_gevent)
  71. def parse(self):
  72. if self.echo:
  73. n_visited, n_urls, n_pool = len(self.visited), len(self.urls), len(self.pool)
  74. status = (
  75. '{} pages parsed :: {} pages in the queue'.format(n_visited, n_urls),
  76. '{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(n_visited, n_pool, n_urls)
  77. )
  78. print(status[int(gevent_installed)])
  79. if not self.urls:
  80. return
  81. else:
  82. url = self.urls.pop()
  83. try:
  84. response = requests.get(url)
  85. # if status code is not 404, then add url in seld.errors dictionary
  86. if response.status_code != 200:
  87. if self.errors.get(str(response.status_code), False):
  88. self.errors[str(response.status_code)].extend([url])
  89. else:
  90. self.errors.update({str(response.status_code): [url]})
  91. self.errlog("Error {} at url {}".format(response.status_code, url))
  92. return
  93. tree = html.fromstring(response.text)
  94. for link_tag in tree.findall('.//a'):
  95. link = link_tag.attrib.get('href', '')
  96. newurl = urlparse.urljoin(self.url, link)
  97. # print(newurl)
  98. if self.is_valid(newurl):
  99. self.visited.update([newurl])
  100. self.urls.update([newurl])
  101. except Exception as e:
  102. self.errlog(repr(e))
  103. def is_valid(self, url):
  104. if '#' in url:
  105. url = url[:url.find('#')]
  106. if url in self.visited:
  107. return False
  108. if self.url not in url:
  109. return False
  110. if re.search(self.regex, url):
  111. return False
  112. return True
  113. def errlog(self, msg):
  114. self.logfile.write(msg)
  115. self.logfile.write('\n')
  116. def write_xml(self):
  117. of = open(self.outputfile, 'w')
  118. of.write('<?xml version="1.0" encoding="utf-8"?>\n')
  119. of.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n')
  120. url_str = '<url><loc>{}</loc></url>\n'
  121. while self.visited:
  122. of.write(url_str.format(self.visited.pop()))
  123. of.write('</urlset>')
  124. of.close()
  125. def write_txt(self):
  126. of = open(self.outputfile, 'w')
  127. url_str = u'{}\n'
  128. while self.visited:
  129. of.write(url_str.format(self.visited.pop()))
  130. of.close()
  131. def show_progress(self, count, total, status=''):
  132. bar_len = 60
  133. filled_len = int(round(bar_len * count / float(total)))
  134. percents = round(100.0 * count / float(total), 1)
  135. bar = '=' * filled_len + '-' * (bar_len - filled_len)
  136. sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
  137. sys.stdout.flush() # As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113)
  138. time.sleep(0.5)