import asyncio
import logging
import re
import urllib.parse

import aiohttp

from pysitemap.format_processors.text import TextWriter
from pysitemap.format_processors.xml import XMLWriter


class Crawler:

    format_processors = {
        'xml': XMLWriter,
        'txt': TextWriter
    }

    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
        """
        Crawler constructor
        :param rooturl: root URL of the site to crawl
        :type rooturl: str
        :param out_file: file to save the sitemap result to
        :type out_file: str
        :param out_format: sitemap format [xml | txt]. Default: xml
        :type out_format: str
        :param maxtasks: maximum number of concurrent tasks. Default: 100
        :type maxtasks: int
        """
        self.rooturl = rooturl
        self.todo = set()   # URLs queued but not yet being processed
        self.busy = set()   # URLs currently being processed
        self.done = {}      # URL -> True/False (crawled successfully or not)
        self.tasks = set()  # pending asyncio tasks
        self.sem = asyncio.Semaphore(maxtasks)

        # the ClientSession stores cookies between requests and reuses a connection pool
        self.session = aiohttp.ClientSession()
        self.writer = self.format_processors.get(out_format)(out_file)

    async def run(self):
        """
        Main entry point: crawl the site starting from the root URL and
        write the sitemap once all tasks have finished
        :return:
        """
        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
        await asyncio.sleep(1)
        # wait until no URL is being processed any more
        while self.busy:
            await asyncio.sleep(1)

        await t
        await self.session.close()
        # only successfully crawled URLs end up in the sitemap
        await self.writer.write([key for key, value in self.done.items() if value])

    async def addurls(self, urls):
        """
        Add URLs to the queue and schedule processing tasks for them
        :param urls: iterable of (url, parenturl) tuples
        :return:
        """
        for url, parenturl in urls:
            # resolve relative URLs and drop fragments
            url = urllib.parse.urljoin(parenturl, url)
            url, frag = urllib.parse.urldefrag(url)
            if (url.startswith(self.rooturl) and
                    url not in self.busy and
                    url not in self.done and
                    url not in self.todo):
                self.todo.add(url)
                # Acquire semaphore to limit the number of concurrent tasks
                await self.sem.acquire()
                # Create async task
                task = asyncio.ensure_future(self.process(url))
                # Add callback to the task to release the semaphore
                task.add_done_callback(lambda t: self.sem.release())
                # Callback to remove the task from the task set
                task.add_done_callback(self.tasks.remove)
                # Add task to the task set
                self.tasks.add(task)

    async def process(self, url):
        """
        Process a single URL: fetch it, mark it done and queue the links found in it
        :param url:
        :return:
        """
        print('processing:', url)

        # move the URL from the todo queue to the busy set
        self.todo.remove(url)
        self.busy.add(url)

        try:
            resp = await self.session.get(url)  # await response
        except Exception as exc:
            # on any exception mark the URL as bad
            print('...', url, 'has error', repr(str(exc)))
            self.done[url] = False
        else:
            # only parse responses with status == 200 and content type 'text/html'
            if (resp.status == 200 and
                    ('text/html' in resp.headers.get('content-type', ''))):
                data = (await resp.read()).decode('utf-8', 'replace')
                # extract href attribute values and queue them for crawling
                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                asyncio.ensure_future(self.addurls([(u, url) for u in urls]))

            # the request succeeded, so mark the URL as good
            resp.close()
            self.done[url] = True

        self.busy.remove(url)
        logging.info('%d completed tasks, %d still pending, todo %d',
                     len(self.done), len(self.tasks), len(self.todo))
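
# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Shows one way to drive Crawler with an asyncio event loop. The root URL and
# output file name below are placeholder assumptions; adjust them as needed.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    crawler = Crawler('https://example.com/', out_file='sitemap.xml',
                      out_format='xml', maxtasks=50)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(crawler.run())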