mechanize removed

now using requests and lxml.html (both are required installs)
links with a response code != 200 will be written to the file errors.txt in the path ./
pysitemap-python-2.7
Kamo Petrosyan committed 6 years ago
commit 3e5209802e
5 changed files with 31 additions and 15 deletions:

  1. .gitignore (+3, -0)
  2. pysitemap/crawler.py (+22, -10)
  3. requirements.txt (+2, -2)
  4. setup.py (+3, -3)
  5. tests/__init__.py (+1, -0)
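
The heart of the change is replacing mechanize's browser object with a plain requests.get() call plus lxml.html link extraction. Below is a minimal standalone sketch of that flow, assuming Python 2.7 to match the repo; TARGET_URL is a placeholder, not something this commit defines:

    # -*- coding: utf-8 -*-
    # Sketch of the new fetch-and-extract flow (not the crawler class itself).
    import urlparse

    import requests
    from lxml import html

    TARGET_URL = 'http://example.com/'  # placeholder

    response = requests.get(TARGET_URL)
    if response.status_code != 200:
        # the crawler collects these and later dumps them to ./errors.txt
        print 'Error {} at url {}'.format(response.status_code, TARGET_URL)
    else:
        # parse the HTML body and visit every <a> element
        tree = html.fromstring(response.text)
        for link_tag in tree.findall('.//a'):
            link = link_tag.attrib.get('href', '')
            # resolve relative hrefs against the page address
            print urlparse.urljoin(TARGET_URL, link)

Note that requests does not raise on 4xx/5xx responses by default; the explicit status_code check is what routes failures into errors.txt.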

.gitignore (+3, -0)

@@ -1,3 +1,6 @@
 .idea/
 tests/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]


pysitemap/crawler.py (+22, -10)

@@ -1,5 +1,9 @@
+# -*- coding: utf-8 -*-
+import __future__
+import sys
 import urlparse
-import mechanize
+import requests
+from lxml import html
 import re
 try:
     import sys
@@ -27,6 +31,7 @@ class Crawler:
         self.visited = set([url])
         self.exts = ['htm', 'php']
         self.allowed_regex = '\.((?!htm)(?!php)\w+)$'
+        self.errors = {'404': []}

     def set_exts(self, exts):
         self.exts = exts
@@ -54,6 +59,10 @@
             self.write_xml()
         elif self.oformat == 'txt':
             self.write_txt()
+        with open('errors.txt', 'w') as err_file:
+            for key, val in self.errors.items():
+                err_file.write(u'\n\nError {}\n\n'.format(key))
+                err_file.write(u'\n'.join(set(val)))

     def parse_gevent(self):
         self.parse()
@@ -74,15 +83,21 @@
                 return
             else:
                 url = self.urls.pop()
-            br = mechanize.Browser()
             try:
-                response = br.open(url)
-                if response.code >= 400:
-                    self.errlog("Error {} at url {}".format(response.code, url))
+                response = requests.get(url)
+                # if the status code is not 200, add the url to the self.errors dictionary
+                if response.status_code != 200:
+                    if self.errors.get(str(response.status_code), False):
+                        self.errors[str(response.status_code)].extend([url])
+                    else:
+                        self.errors.update({str(response.status_code): [url]})
+                    self.errlog("Error {} at url {}".format(response.status_code, url))
                     return
-                for link in br.links():
-                    newurl = urlparse.urljoin(link.base_url, link.url)
+                tree = html.fromstring(response.text)
+                for link_tag in tree.findall('.//a'):
+                    link = link_tag.attrib.get('href', '')
+                    newurl = urlparse.urljoin(self.url, link)
                     # print(newurl)
                     if self.is_valid(newurl):
                         self.visited.update([newurl])
@@ -90,9 +105,6 @@
             except Exception, e:
                 self.errlog(e.message)
-                br.close()
-                del(br)

     def is_valid(self, url):
         if '#' in url:
             url = url[:url.find('#')]
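
The get()/extend()/update() branches above, which file a failing URL under its status code, can be written more compactly with dict.setdefault. A sketch of the equivalent bookkeeping follows; this is an alternative, not the code the commit ships, and the example URLs are made up:

    # -*- coding: utf-8 -*-
    errors = {'404': []}  # pre-seeded the same way Crawler.__init__ does

    def record_error(status_code, url):
        # group failing URLs under their status code, creating the list on demand
        errors.setdefault(str(status_code), []).append(url)

    record_error(404, 'http://example.com/missing')
    record_error(500, 'http://example.com/broken')

    # mirror of the error dump added in this commit: an "Error <code>" header
    # per status code, followed by that code's de-duplicated URLs
    with open('errors.txt', 'w') as err_file:
        for key, val in errors.items():
            err_file.write(u'\n\nError {}\n\n'.format(key))
            err_file.write(u'\n'.join(set(val)))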


requirements.txt (+2, -2)

@@ -1,2 +1,2 @@
-BeautifulSoup
-mechanize
+lxml
+requests

setup.py (+3, -3)

@@ -1,7 +1,7 @@
-from distutils.core import setup
+from setuptools import find_packages, setup

-EXCLUDE_FROM_PACKAGES = []
+EXCLUDE_FROM_PACKAGES = ['tests',]

 def get_version(major=0, minor=0, build=0):

@@ -32,6 +32,6 @@ setup(
         'Programming Language :: Python :: 2.7',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
-    install_requires=['beautifulsoup4', 'mechanize'],
-    requires=['beautifulsoup4', 'mechanize']
+    install_requires=['lxml', 'requests'],
+    requires=['lxml', 'requests']
 )
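
The hunks above do not show the packages= argument that consumes EXCLUDE_FROM_PACKAGES, so the wiring below is an assumption about the usual find_packages pattern; name and version are placeholder values:

    # -*- coding: utf-8 -*-
    from setuptools import find_packages, setup

    EXCLUDE_FROM_PACKAGES = ['tests',]

    setup(
        name='pysitemap',   # placeholder metadata for the sketch
        version='0.0.0',    # placeholder
        # skip the tests package when building/installing the distribution
        packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
        install_requires=['lxml', 'requests'],
    )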

tests/__init__.py (+1, -0)

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
