diff --git a/pysitemap/crawler.py b/pysitemap/crawler.py
index 00f5ad6..228465a 100644
--- a/pysitemap/crawler.py
+++ b/pysitemap/crawler.py
@@ -1,162 +1,43 @@
-# -*- coding: utf-8 -*-
-import __future__
-import sys
-if sys.version_info.major == 2:
- import urlparse
-else:
- from urllib import parse as urlparse
-import requests
-from lxml import html
-import re
-import time
-try:
- import sys
- if 'threading' in sys.modules:
- del sys.modules['threading']
- print('threading module loaded before patching!')
- print('threading module deleted from sys.modules!\n')
- from gevent import monkey, pool
- monkey.patch_all()
- gevent_installed = True
-except:
- print("Gevent is not installed. Parsing process will be slower.")
- gevent_installed = False
-
-
-class Crawler:
- def __init__(self, url, outputfile='sitemap.xml', logfile='error.log', oformat='xml'):
- self.url = url
- self.logfile = open(logfile, 'a')
- self.oformat = oformat
- self.outputfile = outputfile
-
- # create lists for urls in queue and visited urls
- self.urls = set([url])
- self.visited = set([url])
- self.exts = ['htm', 'php']
- self.allowed_regex = '\.((?!htm)(?!php)\w+)$'
- self.errors = {'404': []}
-
- def set_exts(self, exts):
- self.exts = exts
-
- def allow_regex(self, regex=None):
- if regex is not None:
- self.allowed_regex = regex
- else:
- allowed_regex = ''
- for ext in self.exts:
- allowed_regex += '(!{})'.format(ext)
- self.allowed_regex = '\.({}\w+)$'.format(allowed_regex)
-
- def crawl(self, echo=False, pool_size=1):
- # sys.stdout.write('echo attribute deprecated and will be removed in future')
- self.echo = echo
- self.regex = re.compile(self.allowed_regex)
-
- print('Parsing pages')
- if gevent_installed and pool_size >= 1:
- self.pool = pool.Pool(pool_size)
- self.pool.spawn(self.parse_gevent)
- self.pool.join()
- else:
- self.pool = [None,] # fixing n_pool exception in self.parse with poolsize > 1 and gevent_installed == False
- while len(self.urls) > 0:
- self.parse()
- if self.oformat == 'xml':
- self.write_xml()
- elif self.oformat == 'txt':
- self.write_txt()
- with open('errors.txt', 'w') as err_file:
- for key, val in self.errors.items():
- err_file.write(u'\n\nError {}\n\n'.format(key))
- err_file.write(u'\n'.join(set(val)))
-
- def parse_gevent(self):
- self.parse()
- while len(self.urls) > 0 and not self.pool.full():
- self.pool.spawn(self.parse_gevent)
-
- def parse(self):
- if self.echo:
- n_visited, n_urls, n_pool = len(self.visited), len(self.urls), len(self.pool)
- status = (
- '{} pages parsed :: {} pages in the queue'.format(n_visited, n_urls),
- '{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(n_visited, n_pool, n_urls)
- )
- print(status[int(gevent_installed)])
-
- if not self.urls:
- return
- else:
- url = self.urls.pop()
- try:
- response = requests.get(url)
- # if status code is not 404, then add url in seld.errors dictionary
- if response.status_code != 200:
- if self.errors.get(str(response.status_code), False):
- self.errors[str(response.status_code)].extend([url])
- else:
- self.errors.update({str(response.status_code): [url]})
- self.errlog("Error {} at url {}".format(response.status_code, url))
- return
-
- try:
- tree = html.fromstring(response.text)
- except ValueError as e:
- self.errlog(repr(e))
- tree = html.fromstring(response.content)
- for link_tag in tree.findall('.//a'):
- link = link_tag.attrib.get('href', '')
- newurl = urlparse.urljoin(self.url, link)
- # print(newurl)
- if self.is_valid(newurl):
- self.visited.update([newurl])
- self.urls.update([newurl])
- except Exception as e:
- self.errlog(repr(e))
-
- def is_valid(self, url):
- oldurl = url
- if '#' in url:
- url = url[:url.find('#')]
- if url in self.visited or oldurl in self.visited:
- return False
- if self.url not in url:
- return False
- if re.search(self.regex, url):
- return False
- return True
-
- def errlog(self, msg):
- self.logfile.write(msg)
- self.logfile.write('\n')
-
- def write_xml(self):
- of = open(self.outputfile, 'w')
- of.write('\n')
- of.write('\n')
- url_str = '{}\n'
- while self.visited:
- of.write(url_str.format(self.visited.pop()))
-
- of.write('')
- of.close()
-
- def write_txt(self):
- of = open(self.outputfile, 'w')
- url_str = u'{}\n'
- while self.visited:
- of.write(url_str.format(self.visited.pop()))
-
- of.close()
-
- def show_progress(self, count, total, status=''):
- bar_len = 60
- filled_len = int(round(bar_len * count / float(total)))
-
-        percents = round(100.0 * count / float(total), 1)
-        bar = '=' * filled_len + '-' * (bar_len - filled_len)
-        sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
-        sys.stdout.flush() # As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113)
-        time.sleep(0.5)
+#!/usr/bin/env python3
+import asyncio
+from aiohttp import ClientSession
+
+class Knocker:
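+    """Fetch each URL concurrently over a shared aiohttp session."""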
+ def __init__(self, urls=None, sleep_time=.5):
+ self.urls = urls or []
+ self.sleep_time = float(sleep_time)
+
+ async def fetch(self, url, session):
+ async with session.get(url) as response:
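+            # Throttle politely between consecutive requests.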
+ await asyncio.sleep(self.sleep_time)
+ status = response.status
+ date = response.headers.get("DATE")
+ print("{}:{} with status {}".format(date, response.url, status))
+ return url, status
+
+ async def bound_fetch(self, sem, url, session):
+        # Acquire the semaphore before fetching so concurrency stays bounded.
+ async with sem:
+            return await self.fetch(url, session)
+
+ async def run(self):
+ tasks = []
+ # create instance of Semaphore
+ sem = asyncio.Semaphore(20)
+
+        # Create one client session so connections are reused
+        # instead of opening a new one for every request.
+ async with ClientSession() as session:
+ for url in self.urls:
+ # pass Semaphore and session to every GET request
+ task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
+ tasks.append(task)
+
+            return await asyncio.gather(*tasks)
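+
+# A minimal usage sketch (the URL list is illustrative):
+#
+#     knocker = Knocker(urls=['http://example.com/'], sleep_time=0.5)
+#     asyncio.get_event_loop().run_until_complete(knocker.run())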
diff --git a/pysitemap/db.py b/pysitemap/db.py
new file mode 100644
index 0000000..704d688
--- /dev/null
+++ b/pysitemap/db.py
@@ -0,0 +1,36 @@
+from sqlalchemy import create_engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker, scoped_session
+from sqlalchemy.sql import ClauseElement
+
+DB_URI = 'sqlite:///stuff.db'
+
+db_engine = create_engine(DB_URI)
+
+session = scoped_session(
+ sessionmaker(
+ autocommit=False,
+ autoflush=False,
+        bind=db_engine
+ )
+)
+
+Model = declarative_base()
+
+
+def get_or_create(session, model, defaults=None, **kwargs):
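+    """Return (instance, created): fetch the first row matching **kwargs or insert a new one."""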
+ instance = session.query(model).filter_by(**kwargs).first()
+ if instance:
+ return instance, False
+ else:
+ params = dict((k, v) for k, v in kwargs.items() if not isinstance(v, ClauseElement))
+ params.update(defaults or {})
+ instance = model(**params)
+ session.add(instance)
+ session.commit()
+ return instance, True
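+
+# A minimal usage sketch (Domain is the mapped model from models.py):
+#
+#     instance, created = get_or_create(session, Domain, domain='example.com')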
diff --git a/pysitemap/models.py b/pysitemap/models.py
new file mode 100644
index 0000000..b277491
--- /dev/null
+++ b/pysitemap/models.py
@@ -0,0 +1,81 @@
+from sqlalchemy import Column, Integer, String, Boolean, ForeignKey, Table
+from sqlalchemy.orm import relationship
+import hashlib
+from db import Model, db_engine, session
+import uuid
+from validators import domain as domain_validator
+
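+# Association table backing the many-to-many Domain <-> DomainGroup relation.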
+groups_domains = Table(
+ 'groups_domains',
+ Model.metadata,
+    Column('domain_id', Integer, ForeignKey('domains.id')),
+    Column('group_id', Integer, ForeignKey('domain_groups.id'))
+)
+
+
+class DomainGroup(Model):
+ __tablename__ = 'domain_groups'
+
+ id = Column(Integer, primary_key=True)
+ name = Column(String(200), nullable=False)
+ domains = relationship("Domain", secondary=groups_domains, back_populates="groups")
+
+ def __init__(self, name):
+ self.name = name
+
+ def __repr__(self):
+        return "<DomainGroup(id={}, name={})>".format(self.id, self.name)
+
+ @classmethod
+ def from_json(cls, data):
+ return cls(**data)
+
+
+class Domain(Model):
+ __tablename__ = 'domains'
+
+ id = Column(Integer, primary_key=True)
+ domain = Column(String(200), nullable=False)
+    groups = relationship("DomainGroup", secondary=groups_domains, back_populates="domains")
+
+ def __init__(self, domain):
+ self.validate_domain(domain)
+ self.domain = domain
+
+ def validate_domain(self, domain=None):
+ domain = domain or self.domain
+ return domain_validator(domain, raise_errors=True)
+
+ def __repr__(self):
+        return "<Domain(id={}, domain={})>".format(self.id, self.domain)
+
+ @classmethod
+ def from_json(cls, data):
+ return cls(**data)
+
+
+class User(Model):
+ __tablename__ = 'users'
+
+ id = Column(Integer, primary_key=True)
+ username = Column(String(50), unique=True)
+ token = Column(String(250), default='Unknown')
+ is_active = Column(Boolean, default=False)
+
+ def __init__(self, username: str):
+ self.username = username
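+        # Derive an opaque API token from the username plus a random UUID.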
+ m = hashlib.sha256()
+ m.update('{}{}'.format(username, uuid.uuid4()).encode('utf-8'))
+ self.token = m.hexdigest()
+
+ @classmethod
+ def validate(cls, username, token):
+ return session.query(cls).filter(cls.username == username, cls.token == token).count() == 1
+
+ def __repr__(self):
+        return "<User(username={})>".format(self.username)
+
+
+Model.metadata.create_all(db_engine)
diff --git a/pysitemap/rest.py b/pysitemap/rest.py
new file mode 100644
index 0000000..83aa3ac
--- /dev/null
+++ b/pysitemap/rest.py
@@ -0,0 +1,129 @@
+import inspect
+from collections import OrderedDict
+
+import json
+from aiohttp.web_exceptions import HTTPBadRequest, HTTPMethodNotAllowed
+from aiohttp.web_request import Request
+from aiohttp.web_response import Response
+from aiohttp.web_urldispatcher import UrlDispatcher
+
+from db import session, get_or_create
+from models import Domain, DomainGroup
+
+DEFAULT_METHODS = ('GET', 'POST', 'PUT', 'DELETE')
+
+
+class RestEndpoint:
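+    """Base endpoint that dispatches requests to same-named coroutine methods (get, post, ...)."""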
+
+ def __init__(self):
+ self.methods = {}
+
+ for method_name in DEFAULT_METHODS:
+ method = getattr(self, method_name.lower(), None)
+ if method:
+ self.register_method(method_name, method)
+
+ def register_method(self, method_name, method):
+ self.methods[method_name.upper()] = method
+
+ async def dispatch(self, request: Request):
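+        """Call the handler for the request method, filling its parameters from the URL match info."""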
+ method = self.methods.get(request.method.upper())
+ if not method:
+            raise HTTPMethodNotAllowed(request.method, DEFAULT_METHODS)
+
+ wanted_args = list(inspect.signature(method).parameters.keys())
+ available_args = request.match_info.copy()
+ available_args.update({'request': request})
+
+ unsatisfied_args = set(wanted_args) - set(available_args.keys())
+ if unsatisfied_args:
+ # Expected match info that doesn't exist
+            raise HTTPBadRequest(text='missing parameters: {}'.format(', '.join(unsatisfied_args)))
+
+ return await method(**{arg_name: available_args[arg_name] for arg_name in wanted_args})
+
+
+class DomainEndpoint(RestEndpoint):
+ def __init__(self, resource):
+ super().__init__()
+ self.resource = resource
+
+    async def get(self) -> Response:
+        domains = session.query(Domain).all()
+        return Response(status=200, body=self.resource.encode({
+            'domains': [
+                {
+                    'id': domain.id, 'title': domain.domain,
+                    'groups': [{'id': group.id, 'name': group.name} for group in domain.groups]
+                } for domain in domains
+            ]}), content_type='application/json')
+
+ async def post(self, request):
+ data = await request.json()
+        domain, _created = get_or_create(session, Domain, domain=data['domain'])
+
+ return Response(status=200, body=self.resource.encode(
+ {'id': domain.id, 'domain': domain.domain, 'created': _created},
+ ), content_type='application/json')
+
+
+class DomainGroupEndpoint(RestEndpoint):
+ def __init__(self, resource):
+ super().__init__()
+ self.resource = resource
+
+ async def get(self) -> Response:
+
+ return Response(status=200, body=self.resource.encode({
+ 'domain_groups': [
+ {
+ 'id': group.id, 'name': group.name,
+ 'domains': [{'id': domain.id, 'name': domain.domain} for domain in group.domains]
+ } for group in session.query(DomainGroup).all()
+ ]}), content_type='application/json')
+
+ async def post(self, request):
+ data = await request.json()
+ group, _created = get_or_create(session, DomainGroup, name=data['name'])
+ domains = []
+ if data.get('domains'):
+ for domain_el in data.get('domains'):
+ domain, _domain_created = get_or_create(session, Domain, domain=domain_el)
+ domains.append({'id': domain.id, 'domain': domain_el, 'created': _domain_created})
+
+ return Response(
+ status=200,
+ body=self.resource.encode({
+ 'id': group.id,
+ 'name': group.name,
+ 'domains': domains,
+ 'created': _created
+ }), content_type='application/json')
+
+
+class RestResource:
+ def __init__(self, notes, factory, collection, properties, id_field):
+ self.notes = notes
+ self.factory = factory
+ self.collection = collection
+ self.properties = properties
+ self.id_field = id_field
+
+ self.domain_endpoint = DomainEndpoint(self)
+ self.domain_groups_endpoint = DomainGroupEndpoint(self)
+
+ def register(self, router: UrlDispatcher):
+        router.add_route('*', '/domains', self.domain_endpoint.dispatch)
+        router.add_route('*', '/domain_groups', self.domain_groups_endpoint.dispatch)
+
+ def render(self, instance):
+        return OrderedDict((prop, getattr(instance, prop)) for prop in self.properties)
+
+ @staticmethod
+ def encode(data):
+ return json.dumps(data, indent=4).encode('utf-8')
+
+ def render_and_encode(self, instance):
+ return self.encode(self.render(instance))
\ No newline at end of file
diff --git a/pysitemap/validators.py b/pysitemap/validators.py
new file mode 100644
index 0000000..e1bacc0
--- /dev/null
+++ b/pysitemap/validators.py
@@ -0,0 +1,48 @@
+import re
+
+
+class ValidationFailure(Exception):
+    """Raised when a value fails validation."""
+
+
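+# One host label (1-63 chars of letters, digits, '-', '_' or '.'), then a dot
+# and either a plain TLD or a two-part suffix such as 'co.uk'.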
+domain_pattern = re.compile(
+    r'^(([a-zA-Z])|([a-zA-Z][a-zA-Z])|'
+    r'([a-zA-Z][0-9])|([0-9][a-zA-Z])|'
+    r'([a-zA-Z0-9][-_.a-zA-Z0-9]{0,61}[a-zA-Z0-9]))\.'
+    r'([a-zA-Z]{2,13}|[a-zA-Z0-9-]{2,30}\.[a-zA-Z]{2,3})$'
+)
+
+
+def domain(value, raise_errors=True):
+    """
+    Return whether the given value is a valid domain name.
+
+    If the value is a valid domain name this function returns ``True``;
+    otherwise it raises :class:`ValidationFailure`, or returns ``False``
+    when ``raise_errors`` is ``False``.
+
+    Examples::
+
+        >>> domain('example.com')
+        True
+
+        >>> domain('example.com/', raise_errors=False)
+        False
+
+    Supports IDN domains as well::
+
+        >>> domain('xn----gtbspbbmkef.xn--p1ai')
+        True
+
+    :param value: domain string to validate
+    :param raise_errors: raise ValidationFailure on failure instead of returning False
+    """
+ if domain_pattern.match(value) is None:
+ if raise_errors:
+            raise ValidationFailure("{} is not a valid domain".format(value))
+ else:
+ return False
+ return True
+
diff --git a/pysitemap/__init__.py b/pysitemap_old/__init__.py
similarity index 100%
rename from pysitemap/__init__.py
rename to pysitemap_old/__init__.py
diff --git a/pysitemap_old/crawler.py b/pysitemap_old/crawler.py
new file mode 100644
index 0000000..00f5ad6
--- /dev/null
+++ b/pysitemap_old/crawler.py
@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+import __future__
+import sys
+if sys.version_info.major == 2:
+ import urlparse
+else:
+ from urllib import parse as urlparse
+import requests
+from lxml import html
+import re
+import time
+try:
+ import sys
+ if 'threading' in sys.modules:
+ del sys.modules['threading']
+ print('threading module loaded before patching!')
+ print('threading module deleted from sys.modules!\n')
+ from gevent import monkey, pool
+ monkey.patch_all()
+ gevent_installed = True
+except:
+ print("Gevent is not installed. Parsing process will be slower.")
+ gevent_installed = False
+
+
+class Crawler:
+ def __init__(self, url, outputfile='sitemap.xml', logfile='error.log', oformat='xml'):
+ self.url = url
+ self.logfile = open(logfile, 'a')
+ self.oformat = oformat
+ self.outputfile = outputfile
+
+ # create lists for urls in queue and visited urls
+ self.urls = set([url])
+ self.visited = set([url])
+ self.exts = ['htm', 'php']
+ self.allowed_regex = '\.((?!htm)(?!php)\w+)$'
+ self.errors = {'404': []}
+
+ def set_exts(self, exts):
+ self.exts = exts
+
+ def allow_regex(self, regex=None):
+ if regex is not None:
+ self.allowed_regex = regex
+ else:
+ allowed_regex = ''
+ for ext in self.exts:
+ allowed_regex += '(!{})'.format(ext)
+ self.allowed_regex = '\.({}\w+)$'.format(allowed_regex)
+
+ def crawl(self, echo=False, pool_size=1):
+ # sys.stdout.write('echo attribute deprecated and will be removed in future')
+ self.echo = echo
+ self.regex = re.compile(self.allowed_regex)
+
+ print('Parsing pages')
+ if gevent_installed and pool_size >= 1:
+ self.pool = pool.Pool(pool_size)
+ self.pool.spawn(self.parse_gevent)
+ self.pool.join()
+ else:
+ self.pool = [None,] # fixing n_pool exception in self.parse with poolsize > 1 and gevent_installed == False
+ while len(self.urls) > 0:
+ self.parse()
+ if self.oformat == 'xml':
+ self.write_xml()
+ elif self.oformat == 'txt':
+ self.write_txt()
+ with open('errors.txt', 'w') as err_file:
+ for key, val in self.errors.items():
+ err_file.write(u'\n\nError {}\n\n'.format(key))
+ err_file.write(u'\n'.join(set(val)))
+
+ def parse_gevent(self):
+ self.parse()
+ while len(self.urls) > 0 and not self.pool.full():
+ self.pool.spawn(self.parse_gevent)
+
+ def parse(self):
+ if self.echo:
+ n_visited, n_urls, n_pool = len(self.visited), len(self.urls), len(self.pool)
+ status = (
+ '{} pages parsed :: {} pages in the queue'.format(n_visited, n_urls),
+ '{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(n_visited, n_pool, n_urls)
+ )
+ print(status[int(gevent_installed)])
+
+ if not self.urls:
+ return
+ else:
+ url = self.urls.pop()
+ try:
+ response = requests.get(url)
+            # if the status code is not 200, record the url in self.errors
+ if response.status_code != 200:
+ if self.errors.get(str(response.status_code), False):
+ self.errors[str(response.status_code)].extend([url])
+ else:
+ self.errors.update({str(response.status_code): [url]})
+ self.errlog("Error {} at url {}".format(response.status_code, url))
+ return
+
+ try:
+ tree = html.fromstring(response.text)
+ except ValueError as e:
+ self.errlog(repr(e))
+ tree = html.fromstring(response.content)
+ for link_tag in tree.findall('.//a'):
+ link = link_tag.attrib.get('href', '')
+ newurl = urlparse.urljoin(self.url, link)
+ # print(newurl)
+ if self.is_valid(newurl):
+ self.visited.update([newurl])
+ self.urls.update([newurl])
+ except Exception as e:
+ self.errlog(repr(e))
+
+ def is_valid(self, url):
+ oldurl = url
+ if '#' in url:
+ url = url[:url.find('#')]
+ if url in self.visited or oldurl in self.visited:
+ return False
+ if self.url not in url:
+ return False
+ if re.search(self.regex, url):
+ return False
+ return True
+
+ def errlog(self, msg):
+ self.logfile.write(msg)
+ self.logfile.write('\n')
+
+ def write_xml(self):
+ of = open(self.outputfile, 'w')
+ of.write('\n')
+ of.write('\n')
+ url_str = '{}\n'
+ while self.visited:
+ of.write(url_str.format(self.visited.pop()))
+
+ of.write('')
+ of.close()
+
+ def write_txt(self):
+ of = open(self.outputfile, 'w')
+ url_str = u'{}\n'
+ while self.visited:
+ of.write(url_str.format(self.visited.pop()))
+
+ of.close()
+
+ def show_progress(self, count, total, status=''):
+ bar_len = 60
+ filled_len = int(round(bar_len * count / float(total)))
+
+ percents = round(100.0 * count / float(total), 1)
+ bar = '=' * filled_len + '-' * (bar_len - filled_len)
+ sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
+ sys.stdout.flush() # As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113)
+ time.sleep(0.5)
diff --git a/run.py b/run.py
index 8975bac..7997b13 100644
--- a/run.py
+++ b/run.py
@@ -1,5 +1,5 @@
-import pysitemap
-
+from pysitemap import Crawler
+import asyncio
"""
Example script
-Uses gevent to implement multiprocessing if Gevent installed
+Uses asyncio to crawl pages concurrently
@@ -8,9 +8,12 @@ To install gevent:
"""
if __name__ == '__main__':
- url = 'http://www.lumpro.ru/' # url from to crawl
+    url = 'http://www.stroivopros.ru/'  # url to crawl
logfile = 'errlog.log' # path to logfile
oformat = 'xml' # output format
outputfile = 'sitemap.xml' # path to output file
- crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
- crawl.crawl(pool_size=20, echo=True)
+ loop = asyncio.get_event_loop()
+ crawler = Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
+ future = asyncio.ensure_future(crawler.crawl(echo=True))
+ loop.run_until_complete(future)
+