Sitemap generator
import logging
import asyncio
import re
import urllib.parse
from pysitemap.format_processors.xml import XMLWriter
from pysitemap.format_processors.text import TextWriter
import aiohttp


class Crawler:

    format_processors = {
        'xml': XMLWriter,
        'txt': TextWriter
    }
    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[],
                 image_root_urls=[], use_lastmodified=True, verifyssl=True, findimages=True, images_this_domain=True,
                 headers=None, timezone_offset=0, changefreq=None, priorities=None, todo_queue_backend=set,
                 done_backend=dict, done_images=list):
        """
        Crawler constructor
        :param rooturl: root url of the site
        :type rooturl: str
        :param out_file: file to save the sitemap result to
        :type out_file: str
        :param out_format: sitemap type [xml | txt]. Default: xml
        :type out_format: str
        :param maxtasks: maximum number of concurrent tasks. Default: 10
        :type maxtasks: int
        :param exclude_urls: url paths to exclude, relative to the root url
        :type exclude_urls: list
        :param exclude_imgs: image url paths to exclude, relative to the root url
        :type exclude_imgs: list
        :param image_root_urls: recognized image root urls on the domain
        :type image_root_urls: list
        :param use_lastmodified: enable or disable lastmod timestamps for fetched urls
        :type use_lastmodified: bool
        :param verifyssl: verify the website's SSL certificate?
        :type verifyssl: bool
        :param findimages: find image references?
        :type findimages: bool
        :param images_this_domain: find only images that refer to this domain?
        :type images_this_domain: bool
        :param timezone_offset: timezone offset for lastmod tags
        :type timezone_offset: int
        :param changefreq: dictionary where the key is a site sub url regex and the value is a changefreq string
        :type changefreq: dict
        :param priorities: dictionary where the key is a site sub url regex and the value is a priority float
        :type priorities: dict
        """
        self.rooturl = rooturl
        self.exclude_urls = exclude_urls
        self.exclude_imgs = exclude_imgs
        self.use_lastmodified = use_lastmodified
        self.image_root_urls = image_root_urls
        self.findimages = findimages
        self.images_this_domain = images_this_domain
        self.todo_queue = todo_queue_backend()
        self.busy = set()
        self.done = done_backend()
        self.done_images = done_images()
        self.tasks = set()
        self.sem = asyncio.Semaphore(maxtasks)
        self.timezone_offset = timezone_offset
        self.changefreq = changefreq
        self.priorities = priorities

        # connector stores cookies between requests and uses connection pool
        self.session = aiohttp.ClientSession(
            headers=headers,
            connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
        )

        self.writer = self.format_processors.get(out_format)(out_file)
    async def run(self):
        """
        Main function to start parsing the site
        :return:
        """
        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
        await asyncio.sleep(1)
        while self.busy:
            await asyncio.sleep(1)

        await t
        await self.session.close()
        await self.writer.write([(key, value) for key, value in self.done.items() if key and value],
                                self.timezone_offset)
    async def contains(self, url, regex, rlist=True):
        """
        Does the url path match a value in the regex list?
        """
        retvalue = False
        if rlist:
            for exc in regex:
                retvalue = bool(re.search(re.compile(r"{}".format(exc)), url))
                if retvalue:
                    return retvalue
        else:
            retvalue = bool(re.search(re.compile(r"{}".format(regex)), url))
        return retvalue

    async def urldict(self, url, url_dict):
        """
        Parse URL regex (key) and value pairs
        """
        for urlkey, regvalue in url_dict.items():
            if await self.contains(url, urlkey, rlist=False):
                return regvalue
        return None
    async def addurls(self, urls):
        """
        Add urls to the queue and spawn tasks to process them
        :param urls:
        :return:
        """
        for url, parenturl in urls:
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)
            if (url.startswith(self.rooturl) and
                    not await self.contains(url, self.exclude_urls, rlist=True) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo_queue):
                self.todo_queue.add(url)
                # Acquire semaphore
                await self.sem.acquire()
                # Create async task
                task = asyncio.ensure_future(self.process(url))
                # Add callback into task to release semaphore
                task.add_done_callback(lambda t: self.sem.release())
                # Callback to remove task from tasks
                task.add_done_callback(self.tasks.remove)
                # Add task into tasks
                self.tasks.add(task)
    async def mimechecker(self, url, expected):
        """
        Check that the url resource has the expected mimetype
        """
        self.todo_queue.remove(url)
        self.busy.add(url)
        try:
            resp = await self.session.get(url)
        except Exception:
            # request failed, so the resource cannot be validated
            self.busy.remove(url)
            return False

        mime = resp.headers.get('content-type', '')
        if (resp.status == 200 and
                bool(re.search(re.compile(r'{}'.format(expected)), mime))):
            resp.close()
            self.busy.remove(url)
            return True
        resp.close()
        self.busy.remove(url)
        return False
    async def fetchtags(self, data, url, tag_input, fields=[]):
        """
        Find all target tags in the website data and extract the requested fields
        """
        tags = []
        joined_data = ' '.join(data.split('\n'))
        tags_raw = re.findall(re.compile(r'<{}.*?>'.format(tag_input)), joined_data)
        for tag_raw in tags_raw:
            tag_raw = re.sub(re.compile(r'<{}(.*?)>'.format(tag_input)), '\\1', tag_raw)

            # Regex lookahead + lookbehind
            # Find patterns, where a pattern starts with "<word>=" and ends with " <word>="
            # Include the first pattern, which will be used to determine
            # the value which the pattern holds in it
            # TODO Note: this method is error-prone, since it assumes that
            # no argument value inside an <img ... /> tag contains "<somechar>="
            # If this happens, the args regex findall & splitting (below) fails.
            args_raw = re.findall(r'(?i)(?=[\w]+[=]|[\w\"\'])(.*?)(?=\s[\w]+[=])', tag_raw)
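            # Illustrative example (assumed input, not from the original code):
            # for tag_raw = ' src="/images/a.png" title="Photo" alt="A photo"',
            # args_raw becomes ['src="/images/a.png"', 'title="Photo"']; each item
            # is a key=value chunk. Note that the last attribute has no trailing
            # ' <word>=' anchor and is therefore not captured by this pattern.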
            tag = []
            for arg_raw in args_raw:
                arg = arg_raw.split('=')
                if len(arg) != 2:
                    print("warning: failure on tag data parsing operation.")
                    continue
                arg_dict = {}
                key = arg[0]
                # Remove leading and trailing quote marks from value
                value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
                value = re.sub(r'&', '&amp;', value)

                for field in fields:
                    if key == field:
                        arg_dict[field] = value
                    # else:
                    #     print("warning: ignoring tag data value:", key)
                if len(arg_dict) == 1:
                    tags.append(arg_dict)
        return tags
    async def addtagdata(self, tagdata, url, source_url_field,
                         mimetype, tag_root_urls=[], excludes=[],
                         done_list=[], this_domain=True):
        """
        Validate existence of the url in the given tagdata
        :return: list of validated tags (of a single type)
        """
        tags = []
        for tag in tagdata:
            if source_url_field not in tag:
                continue

            if not await self.contains(tag[source_url_field], excludes, rlist=True):
                if this_domain:
                    if not tag[source_url_field].startswith('http'):
                        for tag_root_url in tag_root_urls:
                            if url.startswith(tag_root_url):
                                tag[source_url_field] = tag_root_url + tag[source_url_field]
                                break
                else:
                    if not tag[source_url_field].startswith('http'):
                        continue

                if (tag[source_url_field].startswith('http') and
                        tag not in done_list and
                        tag[source_url_field] not in self.busy and
                        tag[source_url_field] not in self.todo_queue):
                    self.todo_queue.add(tag[source_url_field])
                    # Acquire semaphore
                    await self.sem.acquire()
                    # Create async task
                    task = asyncio.ensure_future(self.mimechecker(tag[source_url_field], mimetype))
                    # Add callback into task to release semaphore
                    task.add_done_callback(lambda t: self.sem.release())
                    # Callback to remove task from tasks
                    task.add_done_callback(self.tasks.remove)
                    # Add task into tasks
                    self.tasks.add(task)
                    try:
                        result = await asyncio.wait_for(task, timeout=20)
                        if result:
                            tags.append(tag)
                    except asyncio.TimeoutError:
                        print("couldn't add tag data:", tag[source_url_field])
                        task.cancel()

        done_list.extend(tags)
        return tags
    async def process(self, url):
        """
        Process a single url
        :param url:
        :return:
        """
        print('processing:', url)

        # remove url from the todo queue and add it to the busy list
        self.todo_queue.remove(url)
        self.busy.add(url)

        lastmod = None
        cf = None
        pr = None
        imgs = []

        try:
            resp = await self.session.get(url)  # await response
        except Exception as exc:
            # on any exception mark url as BAD
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = [False, lastmod, cf, pr, imgs]
        else:
            # only urls with status == 200 and content type 'text/html' are parsed
            if (resp.status == 200 and
                    'text/html' in resp.headers.get('content-type', '')):
                data = (await resp.read()).decode('utf-8', 'replace')
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)

                if self.use_lastmodified:
                    lastmod = resp.headers.get('last-modified')

                if self.findimages:
                    # Ref: https://support.google.com/webmasters/answer/178636?hl=en
                    img_data = await self.fetchtags(
                        data, url, 'img',
                        fields=['src', 'title', 'caption', 'geo_location', 'license']
                    )
                    imgs = await self.addtagdata(
                        tagdata=img_data, url=url,
                        source_url_field='src', mimetype=r'^image/',
                        tag_root_urls=self.image_root_urls,
                        excludes=self.exclude_imgs,
                        done_list=self.done_images,
                        this_domain=self.images_this_domain
                    )

                asyncio.ensure_future(self.addurls([(u, url) for u in urls]))

                try:
                    cf = await self.urldict(url, self.changefreq)
                except AttributeError:
                    # changefreq was not given (None)
                    pass
                try:
                    pr = await self.urldict(url, self.priorities)
                except AttributeError:
                    # priorities was not given (None)
                    pass

            # even if we have no exception, we can mark url as good
            resp.close()
            self.done[url] = [True, lastmod, cf, pr, imgs]

        self.busy.remove(url)
        logging.info('%d completed tasks, %d still pending, todo_queue %d',
                     len(self.done), len(self.tasks), len(self.todo_queue))
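
# Usage sketch (illustrative only, not part of the module). Assuming this file
# is importable as pysitemap.base_crawler, a sitemap could be generated like so:
#
#   import asyncio
#   from pysitemap.base_crawler import Crawler
#
#   async def main():
#       crawler = Crawler('https://example.com/', out_file='sitemap.xml',
#                         changefreq={r'/blog/': 'weekly'},
#                         priorities={r'/blog/': 0.7})
#       await crawler.run()
#
#   asyncio.run(main())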