#!/usr/bin/env python3

import asyncio
from urllib.parse import urlparse, urlunparse

from aiohttp import ClientSession
from parsel import Selector


class Crawler:
    def __init__(self, url, sleep_time=0.5):
        self.urls = [url]
        # Reduce the seed URL to its base (scheme and netloc only) so that
        # relative links can be resolved against it later.
        scheme, netloc, path, params, query, fragment = urlparse(url)
        if not netloc:
            # urlparse puts scheme-less URLs entirely into `path`.
            netloc, path = path, netloc
        self.base_url = urlunparse((scheme, netloc, "", "", "", ""))
        self.sleep_time = float(sleep_time)

    async def fetch(self, url, session):
        async with session.get(url) as response:
            # Be polite: pause between requests.
            await asyncio.sleep(self.sleep_time)
            # response.content is a stream; return the decoded body instead.
            return await response.text()

    async def bound_fetch(self, sem, url, session):
        # Fetch guarded by the semaphore to cap concurrent requests.
        async with sem:
            return await self.fetch(url, session)

    def norm_link(self, url: str):
        # Keep absolute links on this site, resolve protocol-relative and
        # root-relative links, and drop everything else.
        if url.startswith(self.base_url):
            return url
        elif url.startswith("//"):
            # Protocol-relative link: prepend the base URL's scheme and colon.
            return "{scheme}:{url}".format(
                scheme=self.base_url[:self.base_url.find(":")],
                url=url,
            )
        elif url.startswith("/"):
            return self.base_url + url
        return None

    async def parse(self, content):
        # Extract every <a href> on the page and queue the ones that
        # normalize to this site, skipping URLs we already know about.
        sel = Selector(text=content)
        links = sel.xpath('//a/@href').getall()
        normalized_links = []
        for link in links:
            link = self.norm_link(link)
            if link and link not in self.urls:
                normalized_links.append(link)
        self.urls.extend(normalized_links)

    async def run(self):
        tasks = []
        # The semaphore caps the number of requests in flight at once.
        sem = asyncio.Semaphore(20)

        # A single ClientSession reuses connections instead of opening a
        # new one for each request.
        async with ClientSession() as session:
            for url in self.urls:
                # Pass the semaphore and session to every GET request.
                task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
                tasks.append(task)

            # Wait for all pages, then parse them to collect new links.
            pages = await asyncio.gather(*tasks)
            for page in pages:
                await self.parse(page)
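

# A minimal sketch of how the crawler might be driven. The seed URL below
# is purely illustrative; point it at the site you actually want to crawl.
if __name__ == "__main__":
    crawler = Crawler("https://example.com")
    asyncio.run(crawler.run())
    print("Collected {} URLs".format(len(crawler.urls)))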