From 75c4770b127c1d357540d343f65eeb3389bb7dd3 Mon Sep 17 00:00:00 2001 From: Kamo Petrosyan Date: Wed, 19 Feb 2020 01:14:17 +0300 Subject: [PATCH] docstring and comments for code --- .gitignore | 1 + README.md | 6 ++++++ pysitemap/base_crawler.py | 40 +++++++++++++++++++++++++++++++++++++-- run.py | 4 ++-- setup.py | 1 - 5 files changed, 47 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 73c4a0c..9eb4d3b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .idea/ tests/ errors.txt +.pypirc # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 28bb622..17fa079 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,12 @@ Sitemap generator ## installing pip install sitemap-generator + +## requirements + + asyncio + aiofile + aiohttp ## example diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py index c6869a9..30ab28f 100644 --- a/pysitemap/base_crawler.py +++ b/pysitemap/base_crawler.py @@ -1,3 +1,4 @@ +import logging import asyncio import re import urllib.parse @@ -14,6 +15,17 @@ class Crawler: } def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100): + """ + Crawler constructor + :param rooturl: root url of site + :type rooturl: str + :param out_file: file to save sitemap result + :type out_file: str + :param out_format: sitemap type [xml | txt]. Default xml + :type out_format: str + :param maxtasks: maximum count of tasks. 
Default 100 + :type maxtasks: int + """ self.rooturl = rooturl self.todo = set() self.busy = set() @@ -26,6 +38,10 @@ class Crawler: self.writer = self.format_processors.get(out_format)(out_file) async def run(self): + """ + Main function to start parsing site + :return: + """ t = asyncio.ensure_future(self.addurls([(self.rooturl, '')])) await asyncio.sleep(1) while self.busy: @@ -36,6 +52,11 @@ class Crawler: await self.writer.write([key for key, value in self.done.items() if value]) async def addurls(self, urls): + """ + Add urls in queue and run process to parse + :param urls: + :return: + """ for url, parenturl in urls: url = urllib.parse.urljoin(parenturl, url) url, frag = urllib.parse.urldefrag(url) @@ -44,34 +65,49 @@ class Crawler: url not in self.done and url not in self.todo): self.todo.add(url) + # Acquire semaphore await self.sem.acquire() + # Create async task task = asyncio.ensure_future(self.process(url)) + # Add callback into task to release semaphore task.add_done_callback(lambda t: self.sem.release()) + # Callback to remove task from tasks task.add_done_callback(self.tasks.remove) + # Add task into tasks self.tasks.add(task) async def process(self, url): + """ + Process single url + :param url: + :return: + """ print('processing:', url) + # remove url from basic queue and add it into busy list self.todo.remove(url) self.busy.add(url) + try: - resp = await self.session.get(url) + resp = await self.session.get(url) # await response except Exception as exc: + # on any exception mark url as BAD print('...', url, 'has error', repr(str(exc))) self.done[url] = False else: + # only url with status == 200 and content type == 'text/html' parsed if (resp.status == 200 and ('text/html' in resp.headers.get('content-type'))): data = (await resp.read()).decode('utf-8', 'replace') urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data) asyncio.Task(self.addurls([(u, url) for u in urls])) + # even if we have no exception, we can mark url as good resp.close()
self.done[url] = True self.busy.remove(url) - print(len(self.done), 'completed tasks,', len(self.tasks), + logging.info('%s completed tasks, %s %s %s', len(self.done), len(self.tasks), 'still pending, todo', len(self.todo)) diff --git a/run.py b/run.py index a621788..4ecb09d 100644 --- a/run.py +++ b/run.py @@ -11,5 +11,5 @@ if __name__ == '__main__': events.set_event_loop(el) # root_url = sys.argv[1] - root_url = 'https://www.haikson.com' - crawler(root_url, out_file='sitemap.xml') \ No newline at end of file + root_url = 'https://www.haikson.com' + crawler(root_url, out_file='sitemap.xml') \ No newline at end of file diff --git a/setup.py b/setup.py index 2cef56c..6457309 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,3 @@ -from distutils.core import setup from setuptools import find_packages, setup EXCLUDE_FROM_PACKAGES = ['tests',]