
Merge branch 'queue_backend'

Branch: master
Kamo Petrosyan, 4 years ago
Commit: 8c43e6b4e4
8 changed files with 102 additions and 10 deletions
  1. README.rst (+7, -0)
  2. pysitemap/__init__.py (+1, -1)
  3. pysitemap/backends/__init__.py (+0, -0)
  4. pysitemap/backends/sqlite_todo.py (+83, -0)
  5. pysitemap/base_crawler.py (+8, -7)
  6. run.py (+1, -1)
  7. sitemap.xml (+1, -0)
  8. version.py (+1, -1)

README.rst (+7, -0)

@@ -54,6 +54,13 @@ TODO
 changelog
 ---------
+v. 0.9.2
+''''''''
+- todo queue and done list backends
+- added a very slow sqlite backend for the todo queue and done lists (writing 1000 urls takes about 3 minutes)
+- tests for sqlite_todo backend
 v. 0.9.1
 ''''''''
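The changelog entry above flags the new sqlite backend as very slow (roughly 3 minutes to write 1000 urls). Looking at the backend code further down, a likely reason is that add() opens a cursor and commits once per url. Purely as an illustration (add_many is a hypothetical helper, not part of this commit), batching the inserts into a single transaction would look roughly like this:

    import sqlite3

    def add_many(connection, urls):
        # Hypothetical helper, not in this commit: queue many urls with one
        # executemany() call and a single commit, instead of one commit per url.
        cursor = connection.cursor()
        try:
            cursor.executemany("insert or ignore into todo_queue values (?);",
                               ((u,) for u in urls))
            connection.commit()
        finally:
            cursor.close()

    if __name__ == '__main__':
        # Demo against an in-memory database with the same schema as the backend.
        conn = sqlite3.connect(':memory:')
        conn.execute("CREATE TABLE todo_queue (url text(1000) primary key);")
        add_many(conn, ['https://example.com/page/%d' % i for i in range(1000)])
        print(conn.execute("select count(*) from todo_queue").fetchone()[0])  # 1000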


pysitemap/__init__.py (+1, -1)

@@ -21,7 +21,7 @@ def crawler(root_url, out_file, out_format='xml', maxtasks=100):
         loop.add_signal_handler(signal.SIGINT, loop.stop)
     except RuntimeError:
         pass
-    print('todo:', len(c.todo))
+    print('todo_queue:', len(c.todo_queue))
     print('busy:', len(c.busy))
     print('done:', len(c.done), '; ok:', sum(c.done.values()))
     print('tasks:', len(c.tasks))

pysitemap/backends/__init__.py (+0, -0)


pysitemap/backends/sqlite_todo.py (+83, -0)

@@ -0,0 +1,83 @@
import logging
import sqlite3


class SQLiteTodo(object):

    def __init__(self, db_name):
        self.connection = sqlite3.connect(db_name)
        self.__init_tables()

    def __init_tables(self):
        # Recreate the queue table on every start, so the queue does not
        # persist between runs.
        cursor = self.connection.cursor()
        cursor.execute("DROP TABLE IF EXISTS todo_queue;")
        cursor.execute("""
            CREATE TABLE todo_queue (
                url text(1000) primary key
            );
        """)
        self.connection.commit()
        cursor.close()

    def add(self, url):
        cursor = self.connection.cursor()
        try:
            cursor.execute("""insert into todo_queue values (?);""", (url,))
        except Exception as e:
            logging.info(e)
        finally:
            self.connection.commit()
            cursor.close()

    def remove(self, url):
        cursor = self.connection.cursor()
        try:
            cursor.execute("""delete from todo_queue where url = ?;""", (url,))
        except Exception as e:
            logging.info(e)
        finally:
            self.connection.commit()
            cursor.close()

    def __contains__(self, item):
        cursor = self.connection.cursor()
        result = False
        try:
            cursor.execute("""select 1 from todo_queue where url = ?""", (item,))
            row = cursor.fetchone()
            if row:  # fetchone() returns None when the url is not queued
                result = True
        except Exception as e:
            logging.info(e)
        finally:
            cursor.close()
        return result

    def __iter__(self):
        cursor = self.connection.cursor()
        result = []
        try:
            cursor.execute("""select url from todo_queue""")
            rows = cursor.fetchall()
            result = [row[0] for row in rows]
        except Exception as e:
            logging.info(e)
        finally:
            cursor.close()
        return iter(result)

    def __next__(self):
        for url in self:
            yield url

    def __len__(self):
        cursor = self.connection.cursor()
        result = 0
        try:
            cursor.execute("""select count(*) as cnt from todo_queue""")
            row = cursor.fetchone()
            result = row[0]
        except Exception as e:
            logging.info(e)
        finally:
            cursor.close()
        return result
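For reference, a minimal usage sketch of the class above; the 'todo.db' path and the example urls are placeholders, not anything this commit prescribes:

    from pysitemap.backends.sqlite_todo import SQLiteTodo

    todo = SQLiteTodo('todo.db')  # placeholder path; the table is dropped and recreated here
    todo.add('https://www.metpromstroi.ru/')
    todo.add('https://www.metpromstroi.ru/contacts/')

    print('https://www.metpromstroi.ru/' in todo)  # True, via __contains__
    print(len(todo))                               # 2, via __len__
    for url in todo:                               # __iter__ returns a snapshot list
        print(url)

    todo.remove('https://www.metpromstroi.ru/contacts/')
    print(len(todo))                               # 1

Note that __init_tables() drops todo_queue on every construction, so the queue starts empty on each run; it is a crawl-time todo list rather than a persistent store.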

pysitemap/base_crawler.py (+8, -7)

@@ -14,7 +14,8 @@ class Crawler:
         'txt': TextWriter
     }

-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100,
+                 todo_queue_backend=set, done_backend=dict):
         """
         Crawler constructor
         :param rooturl: root url of site
@@ -27,9 +28,9 @@
         :type maxtasks: int
         """
         self.rooturl = rooturl
-        self.todo = set()
+        self.todo_queue = todo_queue_backend()
         self.busy = set()
-        self.done = {}
+        self.done = done_backend()
         self.tasks = set()
         self.sem = asyncio.Semaphore(maxtasks)
@@ -63,8 +64,8 @@
         if (url.startswith(self.rooturl) and
                 url not in self.busy and
                 url not in self.done and
-                url not in self.todo):
-            self.todo.add(url)
+                url not in self.todo_queue):
+            self.todo_queue.add(url)
             # Acquire semaphore
             await self.sem.acquire()
             # Create async task
@@ -85,7 +86,7 @@
         print('processing:', url)

         # remove url from basic queue and add it into busy list
-        self.todo.remove(url)
+        self.todo_queue.remove(url)
         self.busy.add(url)

         try:
@@ -108,6 +109,6 @@
         self.busy.remove(url)

         logging.info(len(self.done), 'completed tasks,', len(self.tasks),
-                     'still pending, todo', len(self.todo))
+                     'still pending, todo_queue', len(self.todo_queue))
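The constructor change above makes both collections pluggable: todo_queue_backend can be any zero-argument callable returning an object with add/remove/__contains__/__iter__/__len__, and done_backend any callable returning a dict-like object. The commit itself only switches the defaults to set and dict, so wiring in SQLiteTodo as below is an assumption for illustration (functools.partial is used because SQLiteTodo needs a db path, and 'todo.db' is a placeholder):

    from functools import partial

    from pysitemap.backends.sqlite_todo import SQLiteTodo
    from pysitemap.base_crawler import Crawler

    # Illustrative wiring only; the commit keeps set/dict as the defaults.
    c = Crawler('https://www.metpromstroi.ru',
                out_file='sitemap.xml',
                todo_queue_backend=partial(SQLiteTodo, 'todo.db'),
                done_backend=dict)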

run.py (+1, -1)

@@ -11,5 +11,5 @@ if __name__ == '__main__':
     events.set_event_loop(el)
     # root_url = sys.argv[1]
-    root_url = 'http://www.haikson.ru'
+    root_url = 'https://www.metpromstroi.ru'
     crawler(root_url, out_file='sitemap.xml')
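For completeness, the crawler() entry point used here also accepts the out_format and maxtasks arguments shown in the pysitemap/__init__.py hunk above; a call using the 'txt' writer from the mapping in base_crawler.py could look like this (the output filename is just an example):

    from pysitemap import crawler

    crawler('https://www.metpromstroi.ru',
            out_file='sitemap.txt',
            out_format='txt',   # TextWriter, per the mapping in base_crawler.py
            maxtasks=100)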

sitemap.xml (+1, -0)

File diff suppressed because it is too large


version.py (+1, -1)

@@ -1 +1 @@
-VERSION = '0.9.1'
+VERSION = '0.9.2'
