From 3e5209802e4873acdca96b57d2010948a4ebdbd4 Mon Sep 17 00:00:00 2001
From: Kamo Petrosyan
Date: Sat, 13 Jan 2018 21:35:27 +0300
Subject: [PATCH] mechanize removed; now using requests and lxml.html (both
 required to install). Links with status code != 200 are written to
 errors.txt in ./

---
 .gitignore           |  3 +++
 pysitemap/crawler.py | 32 ++++++++++++++++++++++----------
 requirements.txt     |  4 ++--
 setup.py             |  6 +++---
 tests/__init__.py    |  1 +
 5 files changed, 31 insertions(+), 15 deletions(-)
 create mode 100644 tests/__init__.py

diff --git a/.gitignore b/.gitignore
index a1e8ffd..6b8669e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+.idea/
+tests/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/pysitemap/crawler.py b/pysitemap/crawler.py
index fb0b783..fb8b54d 100644
--- a/pysitemap/crawler.py
+++ b/pysitemap/crawler.py
@@ -1,5 +1,9 @@
+# -*- coding: utf-8 -*-
+import __future__
+import sys
 import urlparse
-import mechanize
+import requests
+from lxml import html
 import re
 try:
     import sys
@@ -27,6 +31,7 @@ class Crawler:
         self.visited = set([url])
         self.exts = ['htm', 'php']
         self.allowed_regex = '\.((?!htm)(?!php)\w+)$'
+        self.errors = {'404': []}
 
     def set_exts(self, exts):
         self.exts = exts
@@ -54,6 +59,10 @@
             self.write_xml()
         elif self.oformat == 'txt':
             self.write_txt()
+        with open('errors.txt', 'w') as err_file:
+            for key, val in self.errors.items():
+                err_file.write(u'\n\nError {}\n\n'.format(key))
+                err_file.write(u'\n'.join(set(val)))
 
     def parse_gevent(self):
         self.parse()
@@ -74,15 +83,21 @@
                 return
             else:
                 url = self.urls.pop()
-                br = mechanize.Browser()
                 try:
-                    response = br.open(url)
-                    if response.code >= 400:
-                        self.errlog("Error {} at url {}".format(response.code, url))
+                    response = requests.get(url)
+                    # if the status code is not 200, add the url to the self.errors dictionary
+                    if response.status_code != 200:
+                        if self.errors.get(str(response.status_code), False):
+                            self.errors[str(response.status_code)].extend([url])
+                        else:
+                            self.errors.update({str(response.status_code): [url]})
+                        self.errlog("Error {} at url {}".format(response.status_code, url))
                         return
 
-                    for link in br.links():
-                        newurl = urlparse.urljoin(link.base_url, link.url)
+                    tree = html.fromstring(response.text)
+                    for link_tag in tree.findall('.//a'):
+                        link = link_tag.attrib.get('href', '')
+                        newurl = urlparse.urljoin(self.url, link)
                         # print(newurl)
                         if self.is_valid(newurl):
                             self.visited.update([newurl])
@@ -90,9 +105,6 @@
                 except Exception, e:
                     self.errlog(e.message)
 
-                br.close()
-                del(br)
-
     def is_valid(self, url):
         if '#' in url:
             url = url[:url.find('#')]
diff --git a/requirements.txt b/requirements.txt
index 74ddd39..a3596c0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-BeautifulSoup
-mechanize
+lxml
+requests
diff --git a/setup.py b/setup.py
index 6d59d45..77bbaf1 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from distutils.core import setup
 from setuptools import find_packages, setup
 
-EXCLUDE_FROM_PACKAGES = []
+EXCLUDE_FROM_PACKAGES = ['tests',]
 
 
 def get_version(major=0, minor=0, build=0):
@@ -32,6 +32,6 @@ setup(
         'Programming Language :: Python :: 2.7',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
-    install_requires=['beautifulsoup4', 'mechanize'],
-    requires=['beautifulsoup4', 'mechanize']
+    install_requires=['lxml', 'requests'],
+    requires=['lxml', 'requests']
 )
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..7c68785
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
\ No newline at end of file
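
Note for reviewers: below is a minimal standalone sketch of the fetch-and-parse flow this patch switches to (requests + lxml.html in place of mechanize), useful for trying the new behavior outside the Crawler class. The extract_links helper and the example.com URL are illustrative only and not part of the patch; it assumes requests and lxml are installed and runs under Python 2 (the patch's target) or Python 3.

# -*- coding: utf-8 -*-
# Sketch of the requests + lxml.html link extraction adopted by this patch.
import requests
from lxml import html

try:
    from urlparse import urljoin           # Python 2, as targeted by the patch
except ImportError:
    from urllib.parse import urljoin       # Python 3 fallback

def extract_links(base_url):
    # Fetch the page; the patched crawler records any non-200 response
    # in self.errors and dumps those urls to errors.txt after the crawl.
    response = requests.get(base_url)
    if response.status_code != 200:
        return []
    # Parse the body and resolve every <a href> against the base url,
    # mirroring the tree.findall('.//a') loop added in crawler.py.
    tree = html.fromstring(response.text)
    return [urljoin(base_url, a.attrib.get('href', ''))
            for a in tree.findall('.//a')]

if __name__ == '__main__':
    for link in extract_links('http://example.com/'):
        print(link)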