diff --git a/README.rst b/README.rst index 146f689..c629967 100644 --- a/README.rst +++ b/README.rst @@ -54,6 +54,13 @@ TODO changelog --------- +v. 0.9.2 +'''''''' + +- todo queue and done list backends +- created very slowest sqlite backend for todo queue and done lists (1000 url writing for 3 minutes) +- tests for sqlite_todo backend + v. 0.9.1 '''''''' diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py index 2729498..afbfe71 100644 --- a/pysitemap/__init__.py +++ b/pysitemap/__init__.py @@ -21,7 +21,7 @@ def crawler(root_url, out_file, out_format='xml', maxtasks=100): loop.add_signal_handler(signal.SIGINT, loop.stop) except RuntimeError: pass - print('todo:', len(c.todo)) + print('todo_queue:', len(c.todo_queue)) print('busy:', len(c.busy)) print('done:', len(c.done), '; ok:', sum(c.done.values())) print('tasks:', len(c.tasks)) \ No newline at end of file diff --git a/pysitemap/backends/__init__.py b/pysitemap/backends/__init__.py new file mode 100644 index 0000000..e69de29
diff --git a/pysitemap/backends/sqlite_todo.py b/pysitemap/backends/sqlite_todo.py
new file mode 100644
index 0000000..689c4e6
--- /dev/null
+++ b/pysitemap/backends/sqlite_todo.py
@@ -0,0 +1,111 @@
+import logging
+import sqlite3
+
+
+class SQLiteTodo(object):
+    """Set-like queue of pending URLs backed by an SQLite table.
+
+    Implements the operations used on the todo queue: ``add``, ``remove``,
+    ``in``, ``len()`` and iteration. Database errors are logged and treated
+    as "nothing stored" rather than raised.
+    """
+
+    def __init__(self, db_name):
+        """Open the database and start with an empty ``todo_queue`` table.
+
+        :param db_name: SQLite database path (``':memory:'`` also works)
+        :type db_name: str
+        """
+        self.connection = sqlite3.connect(db_name)
+        self.__init_tables()
+
+    def __init_tables(self):
+        """Drop any existing ``todo_queue`` table and create a fresh one."""
+        cursor = self.connection.cursor()
+        try:
+            cursor.execute("DROP TABLE IF EXISTS todo_queue;")
+            cursor.execute("""
+                CREATE TABLE todo_queue (
+                    url text(1000) primary key
+                );
+            """)
+            self.connection.commit()
+        finally:
+            cursor.close()
+
+    def __write(self, sql, params):
+        """Execute one modifying statement and commit it.
+
+        A failed statement is rolled back and logged instead of raised.
+        """
+        cursor = self.connection.cursor()
+        try:
+            cursor.execute(sql, params)
+            self.connection.commit()
+        except sqlite3.Error:
+            self.connection.rollback()
+            logging.exception("todo_queue write failed: %s", sql)
+        finally:
+            cursor.close()
+
+    def add(self, url):
+        """Queue ``url``; adding an already queued url is a no-op.
+
+        :param url: url to store
+        :type url: str
+        """
+        # "or ignore" gives set.add() semantics without raising (and
+        # logging) an IntegrityError for every duplicate.
+        self.__write("insert or ignore into todo_queue values (?);", (url,))
+
+    def remove(self, url):
+        """Delete ``url`` from the queue; a missing url is not an error.
+
+        :param url: url to delete
+        :type url: str
+        """
+        self.__write("delete from todo_queue where url = ?;", (url,))
+
+    def __contains__(self, item):
+        """Return True if ``item`` is currently queued."""
+        cursor = self.connection.cursor()
+        try:
+            cursor.execute(
+                "select 1 from todo_queue where url = ?", (item,))
+            # fetchone() returns None (not an empty row) when nothing matches.
+            return cursor.fetchone() is not None
+        except sqlite3.Error:
+            logging.exception("todo_queue lookup failed")
+            return False
+        finally:
+            cursor.close()
+
+    def __iter__(self):
+        """Return an iterator over a snapshot of the queued urls.
+
+        All rows are fetched first, so the queue may be modified while the
+        caller is still iterating.
+        """
+        cursor = self.connection.cursor()
+        try:
+            cursor.execute("select url from todo_queue")
+            urls = [row[0] for row in cursor.fetchall()]
+        except sqlite3.Error:
+            logging.exception("todo_queue listing failed")
+            urls = []
+        finally:
+            cursor.close()
+        return iter(urls)
+
+    def __len__(self):
+        """Return the number of queued urls (0 if the count query fails)."""
+        cursor = self.connection.cursor()
+        try:
+            cursor.execute("select count(*) as cnt from todo_queue")
+            return cursor.fetchone()[0]
+        except sqlite3.Error:
+            logging.exception("todo_queue count failed")
+            # len() must return an int, so fall back to 0.
+            return 0
+        finally:
+            cursor.close()
\ No newline at end of file
diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py index 30ab28f..d282c32 100644 --- a/pysitemap/base_crawler.py +++ b/pysitemap/base_crawler.py @@ -14,7 +14,8 @@ class Crawler: 'txt': TextWriter } - def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100): + def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, + todo_queue_backend=set, done_backend=dict): """ Crawler constructor :param rooturl: root url of site @@ -27,9 +28,9 @@ class Crawler: :type maxtasks: int """ self.rooturl = rooturl - self.todo = set() + self.todo_queue = todo_queue_backend() self.busy = set() - self.done = {} + self.done = done_backend() self.tasks = set() self.sem = asyncio.Semaphore(maxtasks) @@ -63,8 +64,8 @@ class Crawler: if (url.startswith(self.rooturl) and url not in self.busy and url not in self.done and - url not in self.todo): - self.todo.add(url) + url not in self.todo_queue): + self.todo_queue.add(url) # 
Acquire semaphore await self.sem.acquire() # Create async task @@ -85,7 +86,7 @@ class Crawler: print('processing:', url) # remove url from basic queue and add it into busy list - self.todo.remove(url) + self.todo_queue.remove(url) self.busy.add(url) try: @@ -108,6 +109,6 @@ class Crawler: self.busy.remove(url) logging.info(len(self.done), 'completed tasks,', len(self.tasks), - 'still pending, todo', len(self.todo)) + 'still pending, todo_queue', len(self.todo_queue)) diff --git a/run.py b/run.py index b19ee9c..a8acbbe 100644 --- a/run.py +++ b/run.py @@ -11,5 +11,5 @@ if __name__ == '__main__': events.set_event_loop(el) # root_url = sys.argv[1] - root_url = 'http://www.haikson.ru' + root_url = 'https://www.metpromstroi.ru' crawler(root_url, out_file='sitemap.xml') \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index e69de29..23d5e1d 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -0,0 +1 @@ + https://www.metpromstroi.ru https://www.metpromstroi.ru/wp-content/plugins/otter-blocks/vendor/codeinwp/gutenberg-animation/assets/css/style.css?ver=5.3.2 https://www.metpromstroi.ru/wp-content/plugins/embed-pdf-viewer/css/embed-pdf-viewer.css?ver=5.3.2 https://www.metpromstroi.ru/wp-includes/css/dist/block-library/style.min.css?ver=5.3.2 https://www.metpromstroi.ru/wp-content/plugins/contact-form-7/includes/css/styles.css?ver=5.1.6 https://www.metpromstroi.ru/wp-includes/wlwmanifest.xml https://www.metpromstroi.ru/wp-content/plugins/otter-blocks/vendor/codeinwp/gutenberg-blocks/build/style.css?ver=1.3.4 https://www.metpromstroi.ru/wp-content/plugins/otter-blocks/vendor/codeinwp/gutenberg-animation/assets/css/animate.min.css?ver=5.3.2 https://www.metpromstroi.ru/wp-content/uploads/2019/05/cropped-angar2104191212-e1558883104684-32x32.jpg https://www.metpromstroi.ru/wp-content/uploads/2019/05/cropped-angar2104191212-e1558883104684-192x192.jpg https://www.metpromstroi.ru/wp-content/uploads/2019/05/cropped-angar2104191212-e1558883104684-180x180.jpg 
https://www.metpromstroi.ru/wp-content/themes/neve/style.min.css?ver=2.5.4 https://www.metpromstroi.ru/ https://www.metpromstroi.ru/feed/ https://www.metpromstroi.ru/comments/feed/ https://www.metpromstroi.ru/xmlrpc.php?rsd https://www.metpromstroi.ru/wp-json/ https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2F https://www.metpromstroi.ru/category/proizvodstvo/ https://www.metpromstroi.ru/category/company/ https://www.metpromstroi.ru/obratnaya-svyaz/ https://www.metpromstroi.ru/angary-i-sklady_pg/ https://www.metpromstroi.ru/kontakty/ https://www.metpromstroi.ru/zhivotnovodcheskie-kompleksy-i-fermy_pg/ https://www.metpromstroi.ru/sklady-i-raspredelitelnye-tsentry_pg/ https://www.metpromstroi.ru/avtomojki-i-sto_pg/ https://www.metpromstroi.ru/zernohranilishha_pg/ https://www.metpromstroi.ru/category/proizvodstvo/feed/ https://www.metpromstroi.ru/kioski-i-torgovye-pavilony_pg/ https://www.metpromstroi.ru/author/haikson/ https://www.metpromstroi.ru/category/company/feed/ https://www.metpromstroi.ru/svidetelstvo-sro_pg/ https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2Fobratnaya-svyaz%2F https://www.metpromstroi.ru/angary-i-sklady_pg/feed/ https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2Fangary-i-sklady_pg%2F https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2Fkontakty%2F https://www.metpromstroi.ru/zhivotnovodcheskie-kompleksy-i-fermy_pg/feed/ https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2Fzhivotnovodcheskie-kompleksy-i-fermy_pg%2F https://www.metpromstroi.ru/xmlrpc.php https://www.metpromstroi.ru/sklady-i-raspredelitelnye-tsentry_pg/feed/ https://www.metpromstroi.ru/avtomojki-i-sto_pg/feed/ https://www.metpromstroi.ru/zernohranilishha_pg/feed/ 
https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2Fsklady-i-raspredelitelnye-tsentry_pg%2F https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2Favtomojki-i-sto_pg%2F https://www.metpromstroi.ru/kioski-i-torgovye-pavilony_pg/feed/ https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2Fzernohranilishha_pg%2F https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2Fkioski-i-torgovye-pavilony_pg%2F https://www.metpromstroi.ru/author/haikson/feed/ https://www.metpromstroi.ru/svidetelstvo-sro_pg/feed/ https://www.metpromstroi.ru/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.metpromstroi.ru%2Fsvidetelstvo-sro_pg%2F https://www.metpromstroi.ru/?p=66 https://www.metpromstroi.ru/?p=1 https://www.metpromstroi.ru/?p=21 https://www.metpromstroi.ru/?p=46 https://www.metpromstroi.ru/?p=100 https://www.metpromstroi.ru/?p=53 https://www.metpromstroi.ru/?p=39 https://www.metpromstroi.ru/?p=61 https://www.metpromstroi.ru/?p=22 \ No newline at end of file diff --git a/version.py b/version.py index 7e91176..4d11208 100644 --- a/version.py +++ b/version.py @@ -1 +1 @@ -VERSION = '0.9.1' \ No newline at end of file +VERSION = '0.9.2' \ No newline at end of file