|
@ -3,25 +3,53 @@ |
|
|
import time |
|
|
import time |
|
|
import asyncio |
|
|
import asyncio |
|
|
from aiohttp import ClientSession |
|
|
from aiohttp import ClientSession |
|
|
|
|
|
from parsel import Selector |
|
|
|
|
|
from urllib.parse import urlparse, urlunparse |
|
|
|
|
|
|
|
|
class Knocker(object):
    """Holds a list of URLs to hit periodically.

    Only the constructor is visible in this chunk; the knocking loop
    presumably lives elsewhere.
    """

    def __init__(self, urls=None, sleep_time=.5):
        """
        urls: iterable of URLs to knock on; defaults to an empty list.
        sleep_time: delay in seconds between requests.
        """
        self.urls = urls or []
        # Bug fix: the original accepted sleep_time but never stored it,
        # so the configured delay was silently dropped. Store it the same
        # way Crawler.__init__ does (coerced to float) for consistency.
        self.sleep_time = float(sleep_time)
class Crawler(object): |
|
|
|
|
|
def __init__(self, url, sleep_time=.5): |
|
|
|
|
|
|
|
|
|
|
|
self.urls = [url] |
|
|
|
|
|
scheme, netloc, path, params, query, fragment = urlparse(url) |
|
|
|
|
|
if not netloc: |
|
|
|
|
|
netloc, path = path, netloc |
|
|
|
|
|
url = urlunparse((scheme, netloc, "", params, "", fragment)) |
|
|
|
|
|
self.base_url = url |
|
|
self.sleep_time = float(sleep_time) |
|
|
self.sleep_time = float(sleep_time) |
|
|
|
|
|
|
|
|
async def fetch(self, url, session): |
|
|
async def fetch(self, url, session): |
|
|
async with session.get(url) as response: |
|
|
async with session.get(url) as response: |
|
|
await asyncio.sleep(self.sleep_time) |
|
|
await asyncio.sleep(self.sleep_time) |
|
|
status = response.status |
|
|
|
|
|
date = response.headers.get("DATE") |
|
|
|
|
|
print("{}:{} with status {}".format(date, response.url, status)) |
|
|
|
|
|
return url, status |
|
|
|
|
|
|
|
|
return response.content |
|
|
|
|
|
|
|
|
async def bound_fetch(self, sem, url, session): |
|
|
async def bound_fetch(self, sem, url, session): |
|
|
# Getter function with semaphore. |
|
|
# Getter function with semaphore. |
|
|
async with sem: |
|
|
async with sem: |
|
|
await self.fetch(url, session) |
|
|
await self.fetch(url, session) |
|
|
|
|
|
|
|
|
|
|
|
def norm_link(self, url:str): |
|
|
|
|
|
if url.startswith(self.base_url): |
|
|
|
|
|
return url |
|
|
|
|
|
elif url.startswith('//'): |
|
|
|
|
|
return "{scheme}{url}".format( |
|
|
|
|
|
scheme=self.base_url[:self.base_url.find(":")], |
|
|
|
|
|
url=url |
|
|
|
|
|
) |
|
|
|
|
|
elif url.startswith("/"): |
|
|
|
|
|
return self.base_url + url |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
async def parse(self, content): |
|
|
|
|
|
sel = Selector(content) |
|
|
|
|
|
links = sel.xpath('//a/@href').getall() |
|
|
|
|
|
normalized_links = [] |
|
|
|
|
|
for link in links: |
|
|
|
|
|
link = self.norm_link(link) |
|
|
|
|
|
if link: |
|
|
|
|
|
normalized_links.append(link) |
|
|
|
|
|
self.urls.extend(normalized_links) |
|
|
|
|
|
|
|
|
async def run(self): |
|
|
async def run(self): |
|
|
tasks = [] |
|
|
tasks = [] |
|
|
# create instance of Semaphore |
|
|
# create instance of Semaphore |
|
|