#!/usr/bin/env python3

import asyncio
from urllib.parse import urlparse, urlunparse

from aiohttp import ClientSession
from parsel import Selector


class Crawler:
    def __init__(self, url, sleep_time=0.5):
        self.urls = [url]
        # Reduce the seed URL to its base (scheme and netloc only) so that
        # relative links can be resolved against it later.
        scheme, netloc, path, params, query, fragment = urlparse(url)
        if not netloc:
            # urlparse puts scheme-less URLs entirely into `path`.
            netloc, path = path, netloc
        self.base_url = urlunparse((scheme, netloc, "", "", "", ""))
        self.sleep_time = float(sleep_time)

    async def fetch(self, url, session):
        async with session.get(url) as response:
            # Be polite: pause between requests.
            await asyncio.sleep(self.sleep_time)
            # response.content is a stream; return the decoded body instead.
            return await response.text()

    async def bound_fetch(self, sem, url, session):
        # Fetch guarded by the semaphore to cap concurrent requests.
        async with sem:
            return await self.fetch(url, session)

    def norm_link(self, url: str):
        # Keep absolute links on this site, resolve protocol-relative and
        # root-relative links, and drop everything else.
        if url.startswith(self.base_url):
            return url
        elif url.startswith("//"):
            # Protocol-relative link: prepend the base URL's scheme and colon.
            return "{scheme}:{url}".format(
                scheme=self.base_url[:self.base_url.find(":")],
                url=url,
            )
        elif url.startswith("/"):
            return self.base_url + url
        return None

    async def parse(self, content):
        # Extract every <a href> on the page and queue the ones that
        # normalize to this site, skipping URLs we already know about.
        sel = Selector(text=content)
        links = sel.xpath('//a/@href').getall()
        normalized_links = []
        for link in links:
            link = self.norm_link(link)
            if link and link not in self.urls:
                normalized_links.append(link)
        self.urls.extend(normalized_links)

    async def run(self):
        tasks = []
        # The semaphore caps the number of requests in flight at once.
        sem = asyncio.Semaphore(20)

        # A single ClientSession reuses connections instead of opening a
        # new one for each request.
        async with ClientSession() as session:
            for url in self.urls:
                # Pass the semaphore and session to every GET request.
                task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
                tasks.append(task)

            # Wait for all pages, then parse them to collect new links.
            pages = await asyncio.gather(*tasks)
            for page in pages:
                await self.parse(page)
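

# A minimal sketch of how the crawler might be driven. The seed URL below
# is purely illustrative; point it at the site you actually want to crawl.
if __name__ == "__main__":
    crawler = Crawler("https://example.com")
    asyncio.run(crawler.run())
    print("Collected {} URLs".format(len(crawler.urls)))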