@@ -1,3 +1,4 @@
+import logging
 import asyncio
 import re
 import urllib.parse
@@ -14,6 +15,17 @@ class Crawler:
     }

     def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
+        """
+        Crawler constructor
+        :param rooturl: root URL of the site to crawl
+        :type rooturl: str
+        :param out_file: file to save the sitemap result to
+        :type out_file: str
+        :param out_format: sitemap format, 'xml' or 'txt'. Default: 'xml'
+        :type out_format: str
+        :param maxtasks: maximum number of concurrent tasks. Default: 100
+        :type maxtasks: int
+        """
         self.rooturl = rooturl
         self.todo = set()
         self.busy = set()
@@ -26,6 +38,10 @@ class Crawler:
         self.writer = self.format_processors.get(out_format)(out_file)

     async def run(self):
+        """
+        Main coroutine: crawls the site starting from the root url and writes the sitemap when done
+        :return:
+        """
         t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
         await asyncio.sleep(1)
         while self.busy:
@@ -36,6 +52,11 @@ class Crawler:
         await self.writer.write([key for key, value in self.done.items() if value])

     async def addurls(self, urls):
+        """
+        Add urls to the queue and schedule tasks to process them
+        :param urls: iterable of (url, parenturl) tuples
+        :return:
+        """
         for url, parenturl in urls:
             url = urllib.parse.urljoin(parenturl, url)
             url, frag = urllib.parse.urldefrag(url)
@@ -44,34 +65,49 @@ class Crawler:
                     url not in self.done and
                     url not in self.todo):
                 self.todo.add(url)
+                # Acquire the semaphore to cap the number of concurrent tasks
                 await self.sem.acquire()
+                # Create an async task that processes the url
                 task = asyncio.ensure_future(self.process(url))
+                # Add a callback to the task to release the semaphore
                 task.add_done_callback(lambda t: self.sem.release())
+                # Add a callback to remove the finished task from the task set
                 task.add_done_callback(self.tasks.remove)
+                # Track the task in the task set
                 self.tasks.add(task)

     async def process(self, url):
+        """
+        Process a single url: fetch it, extract links and mark it as done
+        :param url:
+        :return:
+        """
         print('processing:', url)

+        # Remove the url from the todo queue and add it to the busy set
         self.todo.remove(url)
         self.busy.add(url)

         try:
-            resp = await self.session.get(url)
+            resp = await self.session.get(url)  # await response
         except Exception as exc:
+            # On any exception, mark the url as bad
             print('...', url, 'has error', repr(str(exc)))
             self.done[url] = False
         else:
+            # Only responses with status == 200 and content type 'text/html' are parsed
             if (resp.status == 200 and
                     ('text/html' in resp.headers.get('content-type'))):
                 data = (await resp.read()).decode('utf-8', 'replace')
                 urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                 asyncio.Task(self.addurls([(u, url) for u in urls]))

+            # No exception occurred, so mark the url as good
             resp.close()
             self.done[url] = True

         self.busy.remove(url)
-        print(len(self.done), 'completed tasks,', len(self.tasks),
-              'still pending, todo', len(self.todo))
+        logging.info('%d completed tasks, %d still pending, todo %d',
+                     len(self.done), len(self.tasks), len(self.todo))
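
For reference, the href regex used in process() only captures raw attribute values: the quote is optional and the match stops at whitespace, quotes or angle brackets. A quick standalone check (the sample HTML below is illustrative only):

    import re

    # Illustrative snippet: shows what the crawler's href regex captures.
    html = '<a href="/about">About</a> <A HREF=contact.html>Contact</A>'
    print(re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', html))
    # ['/about', 'contact.html']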
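
The diff covers only part of the Crawler class, so the following is a hypothetical usage sketch rather than the project's documented entry point. It assumes the omitted parts of __init__ create the HTTP session, semaphore and format processors; the URL and output file name are placeholders, and logging is configured here only so the new logging.info() call is visible:

    import asyncio
    import logging

    logging.basicConfig(level=logging.INFO)  # make logging.info() output visible

    # Hypothetical invocation based on the constructor signature shown above.
    crawler = Crawler('https://www.example.com/', out_file='sitemap.xml',
                      out_format='xml', maxtasks=100)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(crawler.run())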