diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py
index afbfe71..66df066 100644
--- a/pysitemap/__init__.py
+++ b/pysitemap/__init__.py
@@ -1,20 +1,34 @@
 import asyncio
 import signal
 from pysitemap.base_crawler import Crawler
 
 
-def crawler(root_url, out_file, out_format='xml', maxtasks=100):
+def crawler(
+        root_url, out_file, out_format='xml',
+        maxtasks=10, exclude_urls=[], verifyssl=True,
+        headers=None, timezone_offset=0, changefreq=None,
+        priorities=None):
     """
     run crowler
     :param root_url: Site root url
     :param out_file: path to the out file
     :param out_format: format of out file [xml, txt]
     :param maxtasks: max count of tasks
+    :param exclude_urls: excludable url paths
+    :param verifyssl: verify website certificate?
+    :param headers: Send these headers in every request
+    :param timezone_offset: timezone offset for lastmod tags
+    :param changefreq: dictionary, where key is site sub url regex, and value is changefreq
+    :param priorities: dictionary, where key is site sub url regex, and value is priority float
     :return:
     """
     loop = asyncio.get_event_loop()
 
-    c = Crawler(root_url, out_file=out_file, out_format=out_format, maxtasks=maxtasks)
+    c = Crawler(root_url, out_file=out_file, out_format=out_format,
+                maxtasks=maxtasks, exclude_urls=exclude_urls, verifyssl=verifyssl,
+                headers=headers, timezone_offset=timezone_offset,
+                changefreq=changefreq, priorities=priorities)
+
     loop.run_until_complete(c.run())
 
     try:
@@ -23,5 +37,5 @@ def crawler(root_url, out_file, out_format='xml', maxtasks=100):
         pass
     print('todo_queue:', len(c.todo_queue))
     print('busy:', len(c.busy))
-    print('done:', len(c.done), '; ok:', sum(c.done.values()))
-    print('tasks:', len(c.tasks))
\ No newline at end of file
+    print('done:', len(c.done), '; ok:', sum(v[0] for v in c.done.values()))
+    print('tasks:', len(c.tasks))
diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index d282c32..cf54c79 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -1,4 +1,4 @@
 import logging
 import asyncio
 import re
 import urllib.parse
@@ -14,7 +14,8 @@ class Crawler:
         'txt': TextWriter
     }
 
-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100,
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], verifyssl=True,
+                 headers=None, timezone_offset=0, changefreq=None, priorities=None,
                  todo_queue_backend=set, done_backend=dict):
         """
         Crawler constructor
@@ -24,18 +25,35 @@ class Crawler:
         :type out_file: str
         :param out_format: sitemap type [xml | txt]. Default xml
         :type out_format: str
-        :param maxtasks: maximum count of tasks. Default 100
+        :param maxtasks: maximum count of tasks. Default 10
         :type maxtasks: int
+        :param exclude_urls: excludable url paths relative to root url
+        :type exclude_urls: list
+        :param verifyssl: verify website certificate?
+        :type verifyssl: boolean
+        :param timezone_offset: timezone offset for lastmod tags
+        :type timezone_offset: int
+        :param changefreq: dictionary, where key is site sub url regex, and value is changefreq
+        :type changefreq: dict
+        :param priorities: dictionary, where key is site sub url regex, and value is priority float
+        :type priorities: dict
         """
 
         self.rooturl = rooturl
+        self.exclude_urls = exclude_urls
         self.todo_queue = todo_queue_backend()
        self.busy = set()
         self.done = done_backend()
         self.tasks = set()
         self.sem = asyncio.Semaphore(maxtasks)
+        self.timezone_offset = timezone_offset
+        self.changefreq = changefreq
+        self.priorities = priorities
         # connector stores cookies between requests and uses connection pool
-        self.session = aiohttp.ClientSession()
+        self.session = aiohttp.ClientSession(
+            headers=headers,
+            connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
+        )
         self.writer = self.format_processors.get(out_format)(out_file)
 
     async def run(self):
@@ -50,7 +68,29 @@ class Crawler:
         await t
         await self.session.close()
 
-        await self.writer.write([key for key, value in self.done.items() if value])
+        await self.writer.write([(key, value) for key, value in self.done.items() if key and value], self.timezone_offset)
+
+    async def contains(self, url, regex, rlist=True):
+        """
+        Does the url path match a value in the regex list?
+        """
+        retvalue = False
+        if rlist:
+            for exc in regex:
+                retvalue = bool(re.search(re.compile(r"{}".format(exc)), url))
+                if retvalue: return retvalue
+        else:
+            retvalue = bool(re.search(re.compile(r"{}".format(regex)), url))
+        return retvalue
+
+    async def urldict(self, url, url_dict):
+        """
+        Parse URL regex (key) and value pairs; url_dict may be None
+        """
+        for urlkey, regvalue in (url_dict or {}).items():
+            if await self.contains(url, urlkey, rlist=False):
+                return regvalue
+        return None
 
     async def addurls(self, urls):
         """
@@ -61,7 +101,9 @@ class Crawler:
         for url, parenturl in urls:
             url = urllib.parse.urljoin(parenturl, url)
             url, frag = urllib.parse.urldefrag(url)
+
             if (url.startswith(self.rooturl) and
+                    not await self.contains(url, self.exclude_urls, rlist=True) and
                     url not in self.busy and
                     url not in self.done and
                     url not in self.todo_queue):
@@ -89,26 +131,37 @@ class Crawler:
         self.todo_queue.remove(url)
         self.busy.add(url)
 
+        lastmod = None
+        cf = None
+        pr = None
+
         try:
             resp = await self.session.get(url)  # await response
         except Exception as exc:
             # on any exception mark url as BAD
             print('...', url, 'has error', repr(str(exc)))
-            self.done[url] = False
+            self.done[url] = [False, lastmod, cf, pr]
         else:
             # only url with status == 200 and content type == 'text/html' parsed
             if (resp.status == 200 and
                     ('text/html' in resp.headers.get('content-type'))):
                 data = (await resp.read()).decode('utf-8', 'replace')
                 urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
+                lastmod = resp.headers.get('last-modified')
+
                 asyncio.Task(self.addurls([(u, url) for u in urls]))
 
+            try: cf = await self.urldict(url, self.changefreq)
+            except IndexError: pass
+
+            try: pr = await self.urldict(url, self.priorities)
+            except IndexError: pass
+
             # even if we have no exception, we can mark url as good
             resp.close()
-            self.done[url] = True
+
+            self.done[url] = [True, lastmod, cf, pr]
 
         self.busy.remove(url)
         logging.info(len(self.done), 'completed tasks,', len(self.tasks),
                      'still pending, todo_queue', len(self.todo_queue))
-
-
diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py
index 9446bb4..300394c 100644
--- a/pysitemap/format_processors/xml.py
+++ b/pysitemap/format_processors/xml.py
@@ -1,14 +1,14 @@
 import asyncio
 from aiofile import AIOFile, Reader, Writer
 import logging
-
+from datetime import datetime, timezone, timedelta
 
 
 class XMLWriter():
     def __init__(self, filename: str):
         self.filename = filename
 
-    async def write(self, urls):
+    async def write(self, urls, timezone_offset):
         async with AIOFile(self.filename, 'w') as aiodf:
             writer = Writer(aiodf)
             await writer('<?xml version="1.0" encoding="utf-8"?>\n')
@@ -17,10 +17,26 @@ class XMLWriter():
                 ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
                 ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n')
             await aiodf.fsync()
-            for url in urls:
-                await writer('<url><loc>{}</loc></url>\n'.format(url))
+            for data in urls:
+
+                timestamp = data[1][1]
+                changefreq = data[1][2]
+                priority = data[1][3]
+                url = "<url><loc>{}</loc>".format(data[0])
+
+                if timestamp is not None:
+                    timestamp = datetime.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=timezone.utc).astimezone(timezone(timedelta(hours=timezone_offset))).isoformat()
+                    url += "<lastmod>{}</lastmod>".format(str(timestamp))
+
+                if changefreq is not None:
+                    url += "<changefreq>{}</changefreq>".format(str(changefreq))
+
+                if priority is not None:
+                    url += "<priority>{}</priority>".format(str(priority))
+
+                await writer('{}</url>\n'.format(url))
+
             await aiodf.fsync()
 
             await writer('</urlset>')
             await aiodf.fsync()
-
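
A minimal usage sketch for the extended crawler() entry point, assuming the patch above is applied; it is not part of the patch. The root URL, output path, header value, and the regex keys and values passed to exclude_urls, changefreq, and priorities are illustrative placeholders.

from pysitemap import crawler

if __name__ == '__main__':
    # Keyword arguments map directly onto the new crawler() signature.
    crawler(
        'https://www.example.com',                          # placeholder root url
        out_file='sitemap.xml',
        out_format='xml',
        maxtasks=10,
        exclude_urls=[r'/tag/', r'\?replytocom='],          # regexes; matching urls are skipped
        verifyssl=True,
        headers={'User-Agent': 'pysitemap'},                # sent with every request
        timezone_offset=3,                                  # lastmod rendered at UTC+3
        changefreq={r'/blog/': 'daily', r'.*': 'weekly'},   # sub url regex -> changefreq
        priorities={r'/blog/': 0.7, r'.*': 0.5},            # sub url regex -> priority
    )

Because urldict() checks the patterns in insertion order and returns the value of the first key that matches, more specific patterns should be listed before a catch-all such as r'.*'.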