@@ -1,4 +1,4 @@
import logging
import asyncio
import re
import urllib.parse
@@ -14,7 +14,8 @@ class Crawler:
        'txt': TextWriter
    }

    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100,
    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], verifyssl=True,
                 headers=None, timezone_offset=0, changefreq=None, priorities=None,
                 todo_queue_backend=set, done_backend=dict):
""" |
|
|
|
Crawler constructor |
|
|
@@ -24,18 +25,35 @@ class Crawler:
        :type out_file: str
        :param out_format: sitemap type [xml | txt]. Default xml
        :type out_format: str
        :param maxtasks: maximum count of tasks. Default 100
        :param maxtasks: maximum count of tasks. Default 10
        :type maxtasks: int
        :param exclude_urls: excludable URL paths, relative to the root URL
        :type exclude_urls: list
        :param verifyssl: verify the website's SSL certificate?
        :type verifyssl: boolean
        :param timezone_offset: timezone offset for lastmod tags
        :type timezone_offset: int
        :param changefreq: dictionary where the key is a regex for site sub-URLs and the value is the changefreq
        :type changefreq: dict
        :param priorities: dictionary where the key is a regex for site sub-URLs and the value is a priority float
        :type priorities: dict
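        Example (illustrative values, not taken from this patch):
            changefreq={'/blog/': 'weekly', '/': 'monthly'}
            priorities={'/blog/': 0.5, '/': 0.7}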
""" |
|
|
|
        self.rooturl = rooturl
        self.exclude_urls = exclude_urls
        self.todo_queue = todo_queue_backend()
        self.busy = set()
        self.done = done_backend()
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)
        self.timezone_offset = timezone_offset
        self.changefreq = changefreq
        self.priorities = priorities

        # connector stores cookies between requests and uses connection pool
        self.session = aiohttp.ClientSession()
        self.session = aiohttp.ClientSession(
            headers=headers,
            connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
        )
        self.writer = self.format_processors.get(out_format)(out_file)

    async def run(self):
@@ -50,7 +68,29 @@ class Crawler:

        await t
        await self.session.close()
        await self.writer.write([key for key, value in self.done.items() if value])
        # done entries are [ok, lastmod, changefreq, priority]; write only urls fetched successfully
        await self.writer.write([(key, value) for key, value in self.done.items() if key and value and value[0]], self.timezone_offset)

    async def contains(self, url, regex, rlist=True):
        """
        Does the URL path match a value in the regex list?
        """
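        # Patterns are applied with re.search over the full URL string, so an
        # illustrative entry such as '/blog/' matches any URL containing /blog/
        # and r'\.pdf$' matches URLs ending in .pdf (examples, not from this patch).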
        retvalue = False
        if rlist:
            for exc in regex:
                retvalue = bool(re.search(re.compile(r"{}".format(exc)), url))
                if retvalue: return retvalue
        else:
            retvalue = bool(re.search(re.compile(r"{}".format(regex)), url))
        return retvalue

    async def urldict(self, url, url_dict):
        """
        Return the value whose URL regex (key) matches the url, or None
        """
        if not url_dict:
            return None
        for urlkey, regvalue in url_dict.items():
            if await self.contains(url, urlkey, rlist=False):
                return regvalue
        return None

    async def addurls(self, urls):
        """
@@ -61,7 +101,9 @@ class Crawler:
        for url, parenturl in urls:
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)

            if (url.startswith(self.rooturl) and
                    not await self.contains(url, self.exclude_urls, rlist=True) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo_queue):
@@ -89,26 +131,37 @@ class Crawler:
        self.todo_queue.remove(url)
        self.busy.add(url)

        lastmod = None
        cf = None
        pr = None

        try:
            resp = await self.session.get(url)  # await response
        except Exception as exc:
            # on any exception mark url as BAD
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = False
            self.done[url] = [False, lastmod, cf, pr]
        else:
            # only url with status == 200 and content type == 'text/html' parsed
            if (resp.status == 200 and
                    ('text/html' in resp.headers.get('content-type', ''))):
                data = (await resp.read()).decode('utf-8', 'replace')
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                lastmod = resp.headers.get('last-modified')

                asyncio.Task(self.addurls([(u, url) for u in urls]))

                try: cf = await self.urldict(url, self.changefreq)
                except IndexError: pass

                try: pr = await self.urldict(url, self.priorities)
                except IndexError: pass

            # even if we have no exception, we can mark url as good
            resp.close()
            self.done[url] = True

            self.done[url] = [True, lastmod, cf, pr]

        self.busy.remove(url)
        logging.info('%s completed tasks, %s still pending, todo_queue %s',
                     len(self.done), len(self.tasks), len(self.todo_queue))
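
Usage sketch (not part of the patch): the snippet below shows how the constructor
options introduced here might be wired together. The module path
pysitemap.base_crawler, the example URL, the regex patterns and the changefreq/priority
values are assumptions for illustration, and the timezone_offset unit is assumed to be hours.

    import asyncio
    from pysitemap.base_crawler import Crawler  # assumed module path

    crawler = Crawler(
        'https://www.example.com/',              # rooturl (placeholder)
        'sitemap.xml',                           # out_file
        out_format='xml',
        exclude_urls=['/admin/', r'\.pdf$'],     # regex patterns to skip (illustrative)
        verifyssl=False,                         # e.g. for a self-signed certificate
        timezone_offset=2,                       # offset applied to lastmod (unit assumed: hours)
        changefreq={'/blog/': 'weekly', '/': 'monthly'},
        priorities={'/blog/': 0.5, '/': 0.7},
    )
    asyncio.get_event_loop().run_until_complete(crawler.run())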