@@ -14,7 +14,8 @@ class Crawler:
         'txt': TextWriter
     }
 
-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100,
+                 todo_queue_backend=set, done_backend=dict):
         """
         Crawler constructor
         :param rooturl: root url of site
@@ -27,9 +28,9 @@ class Crawler:
         :type maxtasks: int
         """
         self.rooturl = rooturl
-        self.todo = set()
+        self.todo_queue = todo_queue_backend()
         self.busy = set()
-        self.done = {}
+        self.done = done_backend()
         self.tasks = set()
         self.sem = asyncio.Semaphore(maxtasks)
 
@@ -63,8 +64,8 @@ class Crawler:
             if (url.startswith(self.rooturl) and
                     url not in self.busy and
                     url not in self.done and
-                    url not in self.todo):
-                self.todo.add(url)
+                    url not in self.todo_queue):
+                self.todo_queue.add(url)
                 # Acquire semaphore
                 await self.sem.acquire()
                 # Create async task
@@ -85,7 +86,7 @@ class Crawler:
         print('processing:', url)
 
         # remove url from basic queue and add it into busy list
-        self.todo.remove(url)
+        self.todo_queue.remove(url)
         self.busy.add(url)
 
         try:
@@ -108,6 +109,6 @@ class Crawler:
 
         self.busy.remove(url)
-        logging.info(len(self.done), 'completed tasks,', len(self.tasks),
-                     'still pending, todo', len(self.todo))
+        logging.info('%d completed tasks, %d still pending, todo_queue %d',
+                     len(self.done), len(self.tasks), len(self.todo_queue))
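
For context, a minimal usage sketch of the new backend hooks (not part of the diff; the module name `crawler` and the constructor arguments shown are assumptions based on the signature above):

    from collections import OrderedDict

    from crawler import Crawler  # assumed module name

    # Defaults keep the old behaviour: set() for the queue, dict() for done.
    default_crawler = Crawler('https://example.com/', 'sitemap.xml')

    # Any set-like object (add/remove/membership) works as todo_queue_backend,
    # and any dict-like object as done_backend; OrderedDict, for example,
    # records completed URLs in the order they finished.
    ordered_crawler = Crawler('https://example.com/', 'sitemap.txt',
                              out_format='txt', maxtasks=50,
                              done_backend=OrderedDict)

The backends only need to satisfy the small contract the class actually uses: `add`/`remove`/`in` for the queue, and `in`/item assignment/`len()` for the done store, which is what makes the `set` and `dict` defaults swappable.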