
aiohttp and aiofile

from-knocker
Kamo Petrosyan, 4 years ago
commit e11e289e5f
7 changed files with 140 additions and 1548 deletions:

  1. pysitemap/__init__.py                    +27     -1
  2. pysitemap/base_crawler.py                +75     -0
  3. pysitemap/crawler.py                      +0    -68
  4. pysitemap/format_processors/__init__.py   +0     -0
  5. pysitemap/format_processors/xml.py       +26     -0
  6. run.py                                   +12    -17
  7. sitemap.xml                               +0  -1462

pysitemap/__init__.py (+27, -1)

@@ -1 +1,27 @@
-from pysitemap.crawler import Crawler
+import asyncio
+import signal
+from pysitemap.base_crawler import Crawler
+
+
+def crawler(root_url, out_file, out_format='xml', maxtasks=100):
+    """
+    Run the crawler.
+    :param root_url: site root URL
+    :param out_file: path to the output file
+    :param out_format: format of the output file [xml, txt]
+    :param maxtasks: maximum number of concurrent tasks
+    :return:
+    """
+    loop = asyncio.get_event_loop()
+
+    c = Crawler(root_url, out_file=out_file, out_format=out_format, maxtasks=maxtasks)
+    loop.run_until_complete(c.run())
+
+    try:
+        loop.add_signal_handler(signal.SIGINT, loop.stop)
+    except RuntimeError:
+        pass
+    print('todo:', len(c.todo))
+    print('busy:', len(c.busy))
+    print('done:', len(c.done), '; ok:', sum(c.done.values()))
+    print('tasks:', len(c.tasks))
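
On top of re-exporting the new Crawler class, the package now exposes a crawler() convenience function that drives a full run. A minimal usage sketch (the URL and output path are placeholders):

    from pysitemap import crawler

    # Crawl everything under the root URL and write the result as an XML sitemap.
    # maxtasks caps the number of requests in flight at any time.
    crawler('https://example.com', out_file='sitemap.xml', out_format='xml', maxtasks=100)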

pysitemap/base_crawler.py (+75, -0)

@@ -0,0 +1,75 @@
+import asyncio
+import re
+import urllib.parse
+from pysitemap.format_processors.xml import XMLWriter
+import aiohttp
+
+
+class Crawler:
+
+    format_processors = {
+        'xml': XMLWriter
+    }
+
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
+        self.rooturl = rooturl
+        self.todo = set()
+        self.busy = set()
+        self.done = {}
+        self.tasks = set()
+        self.sem = asyncio.Semaphore(maxtasks)
+
+        # connector stores cookies between requests and uses connection pool
+        self.session = aiohttp.ClientSession()
+        self.writer = self.format_processors.get(out_format)(out_file)
+
+    async def run(self):
+        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
+        await asyncio.sleep(1)
+        while self.busy:
+            await asyncio.sleep(1)
+
+        await t
+        await self.session.close()
+        await self.writer.write([key for key, value in self.done.items() if value])
+
+    async def addurls(self, urls):
+        for url, parenturl in urls:
+            url = urllib.parse.urljoin(parenturl, url)
+            url, frag = urllib.parse.urldefrag(url)
+            if (url.startswith(self.rooturl) and
+                    url not in self.busy and
+                    url not in self.done and
+                    url not in self.todo):
+                self.todo.add(url)
+                await self.sem.acquire()
+                task = asyncio.ensure_future(self.process(url))
+                task.add_done_callback(lambda t: self.sem.release())
+                task.add_done_callback(self.tasks.remove)
+                self.tasks.add(task)
+
+    async def process(self, url):
+        print('processing:', url)
+        self.todo.remove(url)
+        self.busy.add(url)
+
+        try:
+            resp = await self.session.get(url)
+        except Exception as exc:
+            print('...', url, 'has error', repr(str(exc)))
+            self.done[url] = False
+        else:
+            if (resp.status == 200 and
+                    ('text/html' in resp.headers.get('content-type'))):
+                data = (await resp.read()).decode('utf-8', 'replace')
+                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
+                asyncio.Task(self.addurls([(u, url) for u in urls]))
+
+            resp.close()
+            self.done[url] = True
+
+        self.busy.remove(url)
+        print(len(self.done), 'completed tasks,', len(self.tasks),
+              'still pending, todo', len(self.todo))
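
The concurrency pattern above is the core of the new crawler: addurls() acquires a semaphore slot before scheduling process(), and the slot is released in a done callback, so at most maxtasks pages are fetched at once while run() simply waits for the busy set to drain. A stripped-down, self-contained sketch of that pattern (no HTTP; the simulated work and numbers are made up):

    import asyncio
    import random


    async def main(maxtasks=3, jobs=10):
        sem = asyncio.Semaphore(maxtasks)
        tasks = set()

        async def process(i):
            # Stand-in for session.get(url) plus parsing.
            await asyncio.sleep(random.uniform(0.1, 0.3))
            print('finished job', i)

        for i in range(jobs):
            await sem.acquire()                               # wait for a free slot
            task = asyncio.ensure_future(process(i))
            task.add_done_callback(lambda t: sem.release())   # free the slot when done
            task.add_done_callback(tasks.discard)             # forget finished tasks
            tasks.add(task)

        # Like Crawler.run(): poll until every scheduled task has completed.
        while tasks:
            await asyncio.sleep(0.1)


    asyncio.run(main())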

pysitemap/crawler.py (+0, -68)

@@ -1,68 +0,0 @@
-#!/usr/bin/env python3
-import time
-import asyncio
-from aiohttp import ClientSession
-from parsel import Selector
-from urllib.parse import urlparse, urlunparse
-
-
-class Crawler(object):
-
-    def __init__(self, url, sleep_time=.5):
-        self.urls = [url]
-        scheme, netloc, path, params, query, fragment = urlparse(url)
-        if not netloc:
-            netloc, path = path, netloc
-        url = urlunparse((scheme, netloc, "", params, "", fragment))
-        self.base_url = url
-        self.sleep_time = float(sleep_time)
-
-    async def fetch(self, url, session):
-        async with session.get(url) as response:
-            await asyncio.sleep(self.sleep_time)
-            return response.content
-
-    async def bound_fetch(self, sem, url, session):
-        # Getter function with semaphore.
-        async with sem:
-            await self.fetch(url, session)
-
-    def norm_link(self, url: str):
-        if url.startswith(self.base_url):
-            return url
-        elif url.startswith('//'):
-            return "{scheme}{url}".format(
-                scheme=self.base_url[:self.base_url.find(":")],
-                url=url
-            )
-        elif url.startswith("/"):
-            return self.base_url + url
-        return None
-
-    async def parse(self, content):
-        sel = Selector(content)
-        links = sel.xpath('//a/@href').getall()
-        normalized_links = []
-        for link in links:
-            link = self.norm_link(link)
-            if link:
-                normalized_links.append(link)
-        self.urls.extend(normalized_links)
-
-    async def run(self):
-        tasks = []
-        # create instance of Semaphore
-        sem = asyncio.Semaphore(20)
-
-        # Create client session that will ensure we don't open new connection
-        # per each request.
-        async with ClientSession() as session:
-            for url in self.urls:
-                # pass Semaphore and session to every GET request
-                task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
-                tasks.append(task)
-
-            responses = asyncio.gather(*tasks)
-            await responses
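
One practical difference between the deleted crawler and its replacement is link extraction: the old code parsed fetched pages with parsel's XPath selectors, while base_crawler.py scans the raw HTML with a regular expression. A side-by-side sketch on made-up HTML (parsel is only needed for the first variant and is not imported anywhere in the new code):

    import re
    from parsel import Selector

    html = ('<html><body>'
            '<a href="/about">About</a> '
            '<a href="https://example.com/contact">Contact</a>'
            '</body></html>')

    # XPath query used by the removed crawler.py
    print(Selector(html).xpath('//a/@href').getall())
    # Regex used by the new base_crawler.py
    print(re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', html))

    # Both print: ['/about', 'https://example.com/contact']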

pysitemap/format_processors/__init__.py (+0, -0)


pysitemap/format_processors/xml.py (+26, -0)

@@ -0,0 +1,26 @@
+import asyncio
+from aiofile import AIOFile, Reader, Writer
+import logging
+
+
+class XMLWriter():
+
+    def __init__(self, filename: str):
+        self.filename = filename
+
+    async def write(self, urls):
+        async with AIOFile(self.filename, 'w') as aiodf:
+            writer = Writer(aiodf)
+            await writer('<?xml version="1.0" encoding="utf-8"?>\n')
+            await writer(
+                '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
+                ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
+                ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n')
+            await aiodf.fsync()
+
+            for url in urls:
+                await writer('<url><loc>{}</loc></url>\n'.format(url))
+            await aiodf.fsync()
+
+            await writer('</urlset>')
+            await aiodf.fsync()
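
XMLWriter.write() is a coroutine, so it needs an event loop even when used on its own. A small standalone sketch (the file name and URLs are placeholders, and aiofile must be installed):

    import asyncio
    from pysitemap.format_processors.xml import XMLWriter


    async def main():
        writer = XMLWriter('example-sitemap.xml')
        await writer.write(['https://example.com/', 'https://example.com/about/'])


    asyncio.run(main())

The resulting file is a standard sitemap: an <urlset> element containing one <url><loc>...</loc></url> entry per crawled page.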

run.py (+12, -17)

@@ -1,20 +1,15 @@
-from pysitemap import Crawler
-import asyncio
-"""
-Example script
-Uses gevent to implement multiprocessing if gevent is installed
-To install gevent:
-$ pip install gevent
-"""
+import sys
+import logging
+from pysitemap import crawler
+
 if __name__ == '__main__':
-    url = 'http://www.stroivopros.ru/'  # url to crawl
-    logfile = 'errlog.log'  # path to logfile
-    oformat = 'xml'  # output format
-    outputfile = 'sitemap.xml'  # path to output file
-    loop = asyncio.get_event_loop()
-    crawler: Crawler = Crawler(url=url)
-    future = asyncio.ensure_future(crawler.run())
-    loop.run_until_complete(future)
+    if '--iocp' in sys.argv:
+        from asyncio import events, windows_events
+        sys.argv.remove('--iocp')
+        logging.info('using iocp')
+        el = windows_events.ProactorEventLoop()
+        events.set_event_loop(el)
+
+    # root_url = sys.argv[1]
+    root_url = 'https://www.haikson.com'
+    crawler(root_url, out_file='sitemap.xml')
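
The commented-out root_url = sys.argv[1] line suggests the root URL is eventually meant to come from the command line. A sketch of that variant (the argument handling is an assumption, not part of this commit):

    import sys
    import logging
    from pysitemap import crawler

    if __name__ == '__main__':
        if '--iocp' in sys.argv:
            from asyncio import events, windows_events
            sys.argv.remove('--iocp')
            logging.info('using iocp')
            el = windows_events.ProactorEventLoop()
            events.set_event_loop(el)

        # Fall back to the hard-coded defaults when no arguments are given.
        root_url = sys.argv[1] if len(sys.argv) > 1 else 'https://www.haikson.com'
        out_file = sys.argv[2] if len(sys.argv) > 2 else 'sitemap.xml'
        crawler(root_url, out_file=out_file)

Invoked as, for example: python run.py https://example.com sitemap.xml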

sitemap.xml (+0, -1462)

File diff suppressed because it is too large.

