queue_backend

Branch: backend
Kamo Petrosyan committed 4 years ago · commit 8035674f0c

5 changed files with 11 additions and 9 deletions
  1. pysitemap/__init__.py (+1 −1)
  2. pysitemap/backends/__init__.py (+0 −0)
  3. pysitemap/base_crawler.py (+8 −7)
  4. run.py (+1 −1)
  5. sitemap.xml (+1 −0)

pysitemap/__init__.py (+1 −1)

@@ -21,7 +21,7 @@ def crawler(root_url, out_file, out_format='xml', maxtasks=100):
         loop.add_signal_handler(signal.SIGINT, loop.stop)
     except RuntimeError:
         pass
-    print('todo:', len(c.todo))
+    print('todo_queue:', len(c.todo_queue))
     print('busy:', len(c.busy))
     print('done:', len(c.done), '; ok:', sum(c.done.values()))
     print('tasks:', len(c.tasks))

pysitemap/backends/__init__.py (+0 −0)

(new empty file)

pysitemap/base_crawler.py (+8 −7)

@@ -14,7 +14,8 @@ class Crawler:
         'txt': TextWriter
     }

-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100,
+                 todo_queue_backend=set, done_backend=dict):
         """
         Crawler constructor
         :param rooturl: root url of site
@@ -27,9 +28,9 @@ class Crawler:
         :type maxtasks: int
         """
         self.rooturl = rooturl
-        self.todo = set()
+        self.todo_queue = todo_queue_backend()
         self.busy = set()
-        self.done = {}
+        self.done = done_backend()
         self.tasks = set()
         self.sem = asyncio.Semaphore(maxtasks)

@@ -63,8 +64,8 @@ class Crawler:
             if (url.startswith(self.rooturl) and
                     url not in self.busy and
                     url not in self.done and
-                    url not in self.todo):
-                self.todo.add(url)
+                    url not in self.todo_queue):
+                self.todo_queue.add(url)
                 # Acquire semaphore
                 await self.sem.acquire()
                 # Create async task
@@ -85,7 +86,7 @@ class Crawler:
         print('processing:', url)

         # remove url from basic queue and add it into busy list
-        self.todo.remove(url)
+        self.todo_queue.remove(url)
         self.busy.add(url)

         try:
@@ -108,6 +109,6 @@ class Crawler:
         self.busy.remove(url)
         logging.info(len(self.done), 'completed tasks,', len(self.tasks),
-                     'still pending, todo', len(self.todo))
+                     'still pending, todo_queue', len(self.todo_queue))
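With this change the constructor receives backend classes and instantiates them itself (todo_queue_backend=set, done_backend=dict), so any object exposing the set-like operations used in the hunks above — add(), remove(), membership tests, and len() — should plausibly work as a drop-in todo queue. A minimal sketch under that assumption; the DequeBackend name and its FIFO ordering are illustrative, not part of this commit:

from collections import deque

class DequeBackend:
    """FIFO todo queue exposing the set-like interface Crawler uses."""

    def __init__(self):
        self._items = deque()

    def add(self, url):            # Crawler.addurls: self.todo_queue.add(url)
        self._items.append(url)

    def remove(self, url):         # Crawler.process: self.todo_queue.remove(url)
        self._items.remove(url)

    def __contains__(self, url):   # addurls: url not in self.todo_queue
        return url in self._items

    def __len__(self):             # status printout: len(c.todo_queue)
        return len(self._items)

# The class (not an instance) is passed, mirroring the new defaults:
# c = Crawler('https://example.com', out_file='sitemap.xml',
#             todo_queue_backend=DequeBackend)

Passing the class rather than an instance mirrors how set and dict serve as the defaults and leaves construction to the crawler itself.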

run.py (+1 −1)

@@ -11,5 +11,5 @@ if __name__ == '__main__':
         events.set_event_loop(el)

     # root_url = sys.argv[1]
-    root_url = 'http://www.haikson.ru'
+    root_url = 'https://www.haikson.ru'
     crawler(root_url, out_file='sitemap.xml')
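Note that the scheme of root_url matters here: addurls() in base_crawler.py only queues URLs passing the url.startswith(self.rooturl) check, so with the new https:// root any plain http:// links found on the site are skipped. A quick illustration of that check:

# The startswith filter from Crawler.addurls, applied to the new root URL:
rooturl = 'https://www.haikson.ru'
print('https://www.haikson.ru/page'.startswith(rooturl))  # True  -> queued
print('http://www.haikson.ru/page'.startswith(rooturl))   # False -> skipped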

sitemap.xml (+1 −0)

@@ -0,0 +1 @@
+<?xml version="1.0" encoding="utf-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"> </urlset>
