import logging
import asyncio
import re
import urllib.parse

import aiohttp

from pysitemap.format_processors.xml import XMLWriter
from pysitemap.format_processors.text import TextWriter


class Crawler:

    format_processors = {
        'xml': XMLWriter,
        'txt': TextWriter
    }
    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[],
                 image_root_urls=[], verifyssl=True, findimages=True, images_this_domain=True, headers=None,
                 timezone_offset=0, changefreq=None, priorities=None, todo_queue_backend=set, done_backend=dict,
                 done_images=list):
        """
        Crawler constructor

        :param rooturl: root url of the site to crawl
        :type rooturl: str
        :param out_file: file to save the sitemap result to
        :type out_file: str
        :param out_format: sitemap type [xml | txt], default 'xml'
        :type out_format: str
        :param maxtasks: maximum number of concurrent tasks, default 10
        :type maxtasks: int
        :param exclude_urls: excludable url path regexes, relative to the root url
        :type exclude_urls: list
        :param exclude_imgs: excludable image url path regexes, relative to the root url
        :type exclude_imgs: list
        :param image_root_urls: recognized image root urls on the domain
        :type image_root_urls: list
        :param verifyssl: verify the website certificate?
        :type verifyssl: bool
        :param findimages: find image references?
        :type findimages: bool
        :param images_this_domain: find only images which refer to this domain?
        :type images_this_domain: bool
        :param headers: optional HTTP headers passed to the aiohttp session
        :type headers: dict
        :param timezone_offset: timezone offset for lastmod tags
        :type timezone_offset: int
        :param changefreq: dictionary where the key is a site sub url regex and the value is a changefreq string
        :type changefreq: dict
        :param priorities: dictionary where the key is a site sub url regex and the value is a priority float
        :type priorities: dict
        :param todo_queue_backend: backend container type for the todo queue, default set
        :param done_backend: backend container type for finished urls, default dict
        :param done_images: backend container type for finished images, default list
        """
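        # Illustrative configuration (hypothetical values), e.g.:
        #   Crawler('https://example.com/', out_file='sitemap.xml',
        #           changefreq={r'/news/': 'daily'}, priorities={r'/news/': 0.8},
        #           exclude_urls=[r'/tag/', r'\?print='])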
        self.rooturl = rooturl
        self.exclude_urls = exclude_urls
        self.exclude_imgs = exclude_imgs
        self.image_root_urls = image_root_urls
        self.findimages = findimages
        self.images_this_domain = images_this_domain
        self.todo_queue = todo_queue_backend()
        self.busy = set()
        self.done = done_backend()
        self.done_images = done_images()
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)
        self.timezone_offset = timezone_offset
        self.changefreq = changefreq
        self.priorities = priorities

        # the session stores cookies between requests and reuses connections from the pool
        self.session = aiohttp.ClientSession(
            headers=headers,
            connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
        )
        self.writer = self.format_processors.get(out_format)(out_file)

    async def run(self):
        """
        Main entry point: crawl the site and write the sitemap when done
        :return:
        """
        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
        await asyncio.sleep(1)
        while self.busy:
            await asyncio.sleep(1)

        await t
        await self.session.close()
        await self.writer.write([(key, value) for key, value in self.done.items() if key and value],
                                self.timezone_offset)

    async def contains(self, url, regex, rlist=True):
        """
        Does the url path match a value in the given regex (or regex list)?
        """
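        # e.g. contains('https://example.com/tag/foo', [r'/tag/'], rlist=True) -> True (illustrative)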
        retvalue = False
        if rlist:
            for exc in regex:
                retvalue = bool(re.search(re.compile(r"{}".format(exc)), url))
                if retvalue:
                    return retvalue
        else:
            retvalue = bool(re.search(re.compile(r"{}".format(regex)), url))
        return retvalue

    async def urldict(self, url, url_dict):
        """
        Return the value of the first (regex key, value) pair in url_dict whose key matches the url
        """
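        # e.g. urldict('https://example.com/news/1', {r'/news/': 'daily'}) -> 'daily' (illustrative)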
        if not url_dict:
            return None
        for urlkey, regvalue in url_dict.items():
            if await self.contains(url, urlkey, rlist=False):
                return regvalue
        return None

    async def addurls(self, urls):
        """
        Add urls to the crawl queue and schedule them for processing
        :param urls: iterable of (url, parenturl) tuples
        :return:
        """
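        # Relative links are resolved against the parent url and fragments are dropped, e.g.
        # ('../b#top', 'https://example.com/a/') -> 'https://example.com/b' (illustrative)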
        for url, parenturl in urls:
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)

            if (url.startswith(self.rooturl) and
                    not await self.contains(url, self.exclude_urls, rlist=True) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo_queue):
                self.todo_queue.add(url)
                # Acquire semaphore
                await self.sem.acquire()
                # Create async task
                task = asyncio.ensure_future(self.process(url))
                # Add callback into task to release semaphore
                task.add_done_callback(lambda t: self.sem.release())
                # Callback to remove task from tasks
                task.add_done_callback(self.tasks.remove)
                # Add task into tasks
                self.tasks.add(task)

    async def mimechecker(self, url, expected):
        """
        Check that the url resource mimetype matches the expected regex
        """
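        # e.g. expected=r'^image/' accepts a response with Content-Type 'image/png' (illustrative)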
        self.todo_queue.remove(url)
        self.busy.add(url)

        try:
            resp = await self.session.get(url)
        except Exception:
            # on failure, treat the resource as non-matching
            self.busy.remove(url)
            return False
        else:
            mime = resp.headers.get('content-type', '')
            if (resp.status == 200 and
                    bool(re.search(re.compile(r'{}'.format(expected)), mime))):
                resp.close()
                self.busy.remove(url)
                return True
            resp.close()
            self.busy.remove(url)
            return False

    async def fetchtags(self, data, url, tag_input, fields=[]):
        """
        Find all tags of the given type in the page data and collect the requested attribute fields
        """
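        # Returns a list of tags, each a list of single-key dicts, e.g.
        # [[{'src': '/img/a.png'}, {'title': 'A'}], ...] (illustrative)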
        tags = []
        # Join the page data into a single line so that multi-line tags are matched too
        data_joined = ' '.join(data.split('\n'))

        tags_raw = re.findall(re.compile(r'<{}.*?>'.format(tag_input)), data_joined)

        for tag_raw in tags_raw:
            tag_raw = re.sub(re.compile(r'<{}(.*?)>'.format(tag_input)), '\\1', tag_raw)

            # Regex lookahead + lookbehind:
            # find patterns which start with "<word>=" and end just before the next " <word>=",
            # including the first pattern, which determines the value the pattern holds.

            # TODO Note: this method is error-prone, since it assumes that
            # no attribute value inside an <img ... /> tag contains "<somechar>=".
            # If that happens, the args regex findall & splitting (below) fails.
            args_raw = re.findall(r'(?i)(?=[\w]+[=]|[\w\"\'])(.*?)(?=\s[\w]+[=])', tag_raw)
            tag = []
            for arg_raw in args_raw:
                arg = arg_raw.split('=')
                if len(arg) != 2:
                    print("warning: failure on tag data parsing operation.")
                    continue

                arg_dict = {}
                key = arg[0]
                # Remove leading and trailing quote marks from the value
                value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
                # Decode HTML-escaped ampersands
                value = re.sub(r'&amp;', '&', value)
                for field in fields:
                    if key == field:
                        arg_dict[field] = value
                    # else:
                    #     print("warning: ignoring tag data value:", key)

                if len(arg_dict) == 1:
                    tag.append(arg_dict)
            tags.append(tag)
        return tags

    async def addtagdata(self, tagdata, url, source_url_field,
                         mimetype, tag_root_urls=[], excludes=[],
                         done_list=None, this_domain=True):
        """
        Validate the url found in each given tag and collect the tags that pass
        :return: list of validated tags (of a single type)
        """
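        # Relative sources are made absolute with the matching entry from tag_root_urls, e.g.
        # '/media/a.png' -> 'https://example.com/media/a.png' (illustrative)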
        if done_list is None:
            done_list = []
        tags = []
        for data in tagdata:
            for tag in data:
                if source_url_field not in tag:
                    continue
                if not await self.contains(tag[source_url_field], excludes, rlist=True):

                    if this_domain:
                        if not tag[source_url_field].startswith('http'):
                            for tag_root_url in tag_root_urls:
                                if url.startswith(tag_root_url):
                                    tag[source_url_field] = tag_root_url + tag[source_url_field]
                                    break
                    else:
                        if not tag[source_url_field].startswith('http'):
                            continue

                    if (tag[source_url_field].startswith('http') and
                            data not in done_list and
                            tag[source_url_field] not in self.busy and
                            tag[source_url_field] not in self.todo_queue):
                        self.todo_queue.add(tag[source_url_field])
                        # Acquire semaphore
                        await self.sem.acquire()
                        # Create async task
                        task = asyncio.ensure_future(self.mimechecker(tag[source_url_field], mimetype))
                        # Add callback into task to release semaphore
                        task.add_done_callback(lambda t: self.sem.release())
                        # Callback to remove task from tasks
                        task.add_done_callback(self.tasks.remove)
                        # Add task into tasks
                        self.tasks.add(task)
                        try:
                            result = await asyncio.wait_for(task, timeout=20)
                            if result:
                                tags.append(data)
                        except asyncio.TimeoutError:
                            print("couldn't add tag data:", tag[source_url_field])
                            task.cancel()

        done_list.extend(tags)
        return tags

    async def process(self, url):
        """
        Process a single url
        :param url:
        :return:
        """
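        # Each finished url is recorded in self.done as [ok, lastmod, changefreq, priority, imgs]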
        print('processing:', url)

        # remove url from the todo queue and add it to the busy list
        self.todo_queue.remove(url)
        self.busy.add(url)

        lastmod = None
        cf = None
        pr = None
        imgs = []

        try:
            resp = await self.session.get(url)  # await response
        except Exception as exc:
            # on any exception mark url as BAD
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = [False, lastmod, cf, pr, imgs]
        else:
            # only urls with status == 200 and content type 'text/html' are parsed
            if (resp.status == 200 and
                    'text/html' in resp.headers.get('content-type', '')):
                data = (await resp.read()).decode('utf-8', 'replace')
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)

                lastmod = resp.headers.get('last-modified')

                if self.findimages:
                    # Ref: https://support.google.com/webmasters/answer/178636?hl=en
                    img_data = await self.fetchtags(
                        data, url, 'img',
                        fields=['src', 'title', 'caption', 'geo_location', 'license']
                    )
                    imgs = await self.addtagdata(
                        tagdata=img_data, url=url,
                        source_url_field='src', mimetype=r'^image/',
                        tag_root_urls=self.image_root_urls,
                        excludes=self.exclude_imgs,
                        done_list=self.done_images,
                        this_domain=self.images_this_domain
                    )

                asyncio.ensure_future(self.addurls([(u, url) for u in urls]))

                cf = await self.urldict(url, self.changefreq)
                pr = await self.urldict(url, self.priorities)

            # even if the content was not parsable, we can mark the url as good
            resp.close()
            self.done[url] = [True, lastmod, cf, pr, imgs]

        self.busy.remove(url)
        logging.info('%d completed tasks, %d still pending, todo_queue %d',
                     len(self.done), len(self.tasks), len(self.todo_queue))
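

# Minimal usage sketch (illustrative only; the site url, output path and import path
# are assumptions, not part of this module):
#
#     import asyncio
#     from pysitemap.base_crawler import Crawler  # adjust to wherever Crawler lives
#
#     c = Crawler('https://example.com/', out_file='sitemap.xml')
#     asyncio.get_event_loop().run_until_complete(c.run())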