@@ -1,3 +1,4 @@
+import logging
 import asyncio
 import re
 import urllib.parse
@@ -14,6 +15,17 @@ class Crawler:
     }

     def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
+        """
+        Crawler constructor
+        :param rooturl: root URL of the site to crawl
+        :type rooturl: str
+        :param out_file: file to save the sitemap result to
+        :type out_file: str
+        :param out_format: sitemap format, 'xml' or 'txt'. Default: 'xml'
+        :type out_format: str
+        :param maxtasks: maximum number of concurrent tasks. Default: 100
+        :type maxtasks: int
+        """
         self.rooturl = rooturl
         self.todo = set()
         self.busy = set()
@@ -26,6 +38,10 @@ class Crawler:
         self.writer = self.format_processors.get(out_format)(out_file)

     async def run(self):
+        """
+        Main coroutine: crawls the site starting from the root url and writes the sitemap when done
+        :return:
+        """
         t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
         await asyncio.sleep(1)
         while self.busy:
@@ -36,6 +52,11 @@ class Crawler:
         await self.writer.write([key for key, value in self.done.items() if value])

     async def addurls(self, urls):
+        """
+        Add urls to the queue and schedule tasks to process them
+        :param urls: iterable of (url, parenturl) tuples
+        :return:
+        """
         for url, parenturl in urls:
             url = urllib.parse.urljoin(parenturl, url)
             url, frag = urllib.parse.urldefrag(url)
@@ -44,34 +65,49 @@ class Crawler:
                     url not in self.done and
                     url not in self.todo):
                 self.todo.add(url)
+                # Acquire the semaphore to cap the number of concurrent tasks
                 await self.sem.acquire()
+                # Create an async task that processes the url
                 task = asyncio.ensure_future(self.process(url))
+                # Add a callback to the task to release the semaphore
                 task.add_done_callback(lambda t: self.sem.release())
+                # Add a callback to remove the finished task from the task set
                 task.add_done_callback(self.tasks.remove)
+                # Track the task in the task set
                 self.tasks.add(task)

     async def process(self, url):
+        """
+        Process a single url: fetch it, extract links and mark it as done
+        :param url:
+        :return:
+        """
         print('processing:', url)

+        # Remove the url from the todo queue and add it to the busy set
         self.todo.remove(url)
         self.busy.add(url)

         try:
-            resp = await self.session.get(url)
+            resp = await self.session.get(url)  # await response
         except Exception as exc:
+            # On any exception, mark the url as bad
             print('...', url, 'has error', repr(str(exc)))
             self.done[url] = False
         else:
+            # Only responses with status == 200 and content type 'text/html' are parsed
             if (resp.status == 200 and
                     ('text/html' in resp.headers.get('content-type'))):
                 data = (await resp.read()).decode('utf-8', 'replace')
                 urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                 asyncio.Task(self.addurls([(u, url) for u in urls]))

+            # No exception occurred, so mark the url as good
             resp.close()
             self.done[url] = True

         self.busy.remove(url)
-        print(len(self.done), 'completed tasks,', len(self.tasks),
-              'still pending, todo', len(self.todo))
+        logging.info('%d completed tasks, %d still pending, todo %d',
+                     len(self.done), len(self.tasks), len(self.todo))
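
For reference, the href regex used in process() only captures raw attribute values: the quote is optional and the match stops at whitespace, quotes or angle brackets. A quick standalone check (the sample HTML below is illustrative only):

    import re

    # Illustrative snippet: shows what the crawler's href regex captures.
    html = '<a href="/about">About</a> <A HREF=contact.html>Contact</A>'
    print(re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', html))
    # ['/about', 'contact.html']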
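
The diff covers only part of the Crawler class, so the following is a hypothetical usage sketch rather than the project's documented entry point. It assumes the omitted parts of __init__ create the HTTP session, semaphore and format processors; the URL and output file name are placeholders, and logging is configured here only so the new logging.info() call is visible:

    import asyncio
    import logging

    logging.basicConfig(level=logging.INFO)  # make logging.info() output visible

    # Hypothetical invocation based on the constructor signature shown above.
    crawler = Crawler('https://www.example.com/', out_file='sitemap.xml',
                      out_format='xml', maxtasks=100)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(crawler.run())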