From 8035674f0c47e64380cf59726523fde085141cfe Mon Sep 17 00:00:00 2001 From: Kamo Petrosyan Date: Wed, 19 Feb 2020 10:26:59 +0300 Subject: [PATCH] backend --- pysitemap/__init__.py | 2 +- pysitemap/backends/__init__.py | 0 pysitemap/base_crawler.py | 15 ++++++++------- run.py | 2 +- sitemap.xml | 1 + 5 files changed, 11 insertions(+), 9 deletions(-) create mode 100644 pysitemap/backends/__init__.py diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py index 2729498..afbfe71 100644 --- a/pysitemap/__init__.py +++ b/pysitemap/__init__.py @@ -21,7 +21,7 @@ def crawler(root_url, out_file, out_format='xml', maxtasks=100): loop.add_signal_handler(signal.SIGINT, loop.stop) except RuntimeError: pass - print('todo:', len(c.todo)) + print('todo_queue:', len(c.todo_queue)) print('busy:', len(c.busy)) print('done:', len(c.done), '; ok:', sum(c.done.values())) print('tasks:', len(c.tasks)) \ No newline at end of file diff --git a/pysitemap/backends/__init__.py b/pysitemap/backends/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py index 30ab28f..d282c32 100644 --- a/pysitemap/base_crawler.py +++ b/pysitemap/base_crawler.py @@ -14,7 +14,8 @@ class Crawler: 'txt': TextWriter } - def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100): + def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, + todo_queue_backend=set, done_backend=dict): """ Crawler constructor :param rooturl: root url of site @@ -27,9 +28,9 @@ class Crawler: :type maxtasks: int """ self.rooturl = rooturl - self.todo = set() + self.todo_queue = todo_queue_backend() self.busy = set() - self.done = {} + self.done = done_backend() self.tasks = set() self.sem = asyncio.Semaphore(maxtasks) @@ -63,8 +64,8 @@ class Crawler: if (url.startswith(self.rooturl) and url not in self.busy and url not in self.done and - url not in self.todo): - self.todo.add(url) + url not in self.todo_queue): + self.todo_queue.add(url) # Acquire semaphore await self.sem.acquire() # Create async task @@ -85,7 +86,7 @@ class Crawler: print('processing:', url) # remove url from basic queue and add it into busy list - self.todo.remove(url) + self.todo_queue.remove(url) self.busy.add(url) try: @@ -108,6 +109,6 @@ class Crawler: self.busy.remove(url) logging.info(len(self.done), 'completed tasks,', len(self.tasks), - 'still pending, todo', len(self.todo)) + 'still pending, todo_queue', len(self.todo_queue)) diff --git a/run.py b/run.py index b19ee9c..3b79fca 100644 --- a/run.py +++ b/run.py @@ -11,5 +11,5 @@ if __name__ == '__main__': events.set_event_loop(el) # root_url = sys.argv[1] - root_url = 'http://www.haikson.ru' + root_url = 'https://www.haikson.ru' crawler(root_url, out_file='sitemap.xml') \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index e69de29..7d361c4 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -0,0 +1 @@ + \ No newline at end of file