
docstring and comments for code

queue_backend
Kamo Petrosyan committed 4 years ago
commit 75c4770b12
5 changed files with 47 additions and 5 deletions
  1. .gitignore (+1, -0)
  2. README.md (+6, -0)
  3. pysitemap/base_crawler.py (+38, -2)
  4. run.py (+2, -2)
  5. setup.py (+0, -1)

.gitignore (+1, -0)

@@ -1,6 +1,7 @@
 .idea/
 tests/
 errors.txt
+.pypirc
 
 # Byte-compiled / optimized / DLL files
 __pycache__/


README.md (+6, -0)

@@ -4,6 +4,12 @@ Sitemap generator
 ## installing
 pip install sitemap-generator
+## requirements
+asyncio
+aiofile
+aiohttp
 ## example
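
A minimal usage sketch to go with the "## example" heading, assuming the crawler() helper is imported from pysitemap (the import path is an assumption, mirroring how run.py below calls it) and using a placeholder URL:

    from pysitemap import crawler  # assumed import path, mirroring run.py

    if __name__ == '__main__':
        # hypothetical target site; replace with the site to map
        root_url = 'https://www.example.com'
        crawler(root_url, out_file='sitemap.xml')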


pysitemap/base_crawler.py (+38, -2)

@@ -1,3 +1,4 @@
+import logging
 import asyncio
 import re
 import urllib.parse
@@ -14,6 +15,17 @@ class Crawler:
     }
 
     def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
+        """
+        Crawler constructor
+        :param rooturl: root URL of the site
+        :type rooturl: str
+        :param out_file: file to save the sitemap result to
+        :type out_file: str
+        :param out_format: sitemap type [xml | txt]. Default xml
+        :type out_format: str
+        :param maxtasks: maximum number of concurrent tasks. Default 100
+        :type maxtasks: int
+        """
         self.rooturl = rooturl
         self.todo = set()
         self.busy = set()
@@ -26,6 +38,10 @@
         self.writer = self.format_processors.get(out_format)(out_file)
 
     async def run(self):
+        """
+        Main method that starts crawling the site
+        :return:
+        """
         t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
         await asyncio.sleep(1)
         while self.busy:
@@ -36,6 +52,11 @@
         await self.writer.write([key for key, value in self.done.items() if value])
 
     async def addurls(self, urls):
+        """
+        Add urls to the queue and schedule them for processing
+        :param urls:
+        :return:
+        """
         for url, parenturl in urls:
             url = urllib.parse.urljoin(parenturl, url)
             url, frag = urllib.parse.urldefrag(url)
@@ -44,34 +65,49 @@
                     url not in self.done and
                     url not in self.todo):
                 self.todo.add(url)
+                # Acquire semaphore
                 await self.sem.acquire()
+                # Create async task
                 task = asyncio.ensure_future(self.process(url))
+                # Add callback into task to release semaphore
                 task.add_done_callback(lambda t: self.sem.release())
+                # Callback to remove task from tasks
                 task.add_done_callback(self.tasks.remove)
+                # Add task into tasks
                 self.tasks.add(task)
 
     async def process(self, url):
+        """
+        Process a single url
+        :param url:
+        :return:
+        """
         print('processing:', url)
+        # remove url from the todo queue and add it to the busy set
         self.todo.remove(url)
         self.busy.add(url)
         try:
-            resp = await self.session.get(url)
+            resp = await self.session.get(url)  # await response
         except Exception as exc:
+            # on any exception mark the url as BAD
             print('...', url, 'has error', repr(str(exc)))
             self.done[url] = False
         else:
+            # only responses with status == 200 and content-type 'text/html' are parsed
             if (resp.status == 200 and
                     ('text/html' in resp.headers.get('content-type'))):
                 data = (await resp.read()).decode('utf-8', 'replace')
                 urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                 asyncio.Task(self.addurls([(u, url) for u in urls]))
+            # no exception raised, so mark the url as good
             resp.close()
             self.done[url] = True
         self.busy.remove(url)
-        print(len(self.done), 'completed tasks,', len(self.tasks),
-              'still pending, todo', len(self.todo))
+        logging.info('%d completed tasks, %d still pending, todo %d',
+                     len(self.done), len(self.tasks), len(self.todo))
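
The comments added to addurls() above describe the commit's concurrency pattern: acquire a semaphore slot before creating a task, then let done-callbacks release the slot and drop the finished task from the tracking set. A standalone sketch of that pattern, with illustrative names that are not part of the repository:

    import asyncio

    async def bounded(maxtasks=3):
        sem = asyncio.Semaphore(maxtasks)  # caps in-flight tasks, like Crawler's sem
        tasks = set()

        async def process(i):
            await asyncio.sleep(0.1)       # stand-in for fetching and parsing a url
            print('done', i)

        for i in range(10):
            await sem.acquire()                              # wait for a free slot
            task = asyncio.ensure_future(process(i))
            task.add_done_callback(lambda t: sem.release())  # free the slot on completion
            task.add_done_callback(tasks.remove)             # forget the finished task
            tasks.add(task)

        await asyncio.gather(*tasks)  # wait for whatever is still running

    asyncio.run(bounded())

Releasing the semaphore in a done-callback rather than inside process() keeps the slot held for the task's full lifetime, which is what bounds the crawler to maxtasks concurrent requests.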

run.py (+2, -2)

@@ -11,5 +11,5 @@ if __name__ == '__main__':
     events.set_event_loop(el)
     # root_url = sys.argv[1]
-    root_url = 'https://www.haikson.com'
-    crawler(root_url, out_file='sitemap.xml')
+    root_url = 'http://www.techelec.ru'
+    crawler(root_url, out_file='techelec.xml')
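
A side note on the print-to-logging change in base_crawler.py: with no handler configured, logging shows nothing at INFO level, so run.py (or whatever script drives the crawler) needs a basic setup. A possible configuration, not part of this commit:

    import logging

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
    )

With logging, lazy %-style arguments (logging.info('%d completed tasks', n)) replace print's comma-separated arguments; passing print-style arguments straight to logging.info() raises a formatting error when the record is rendered.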

setup.py (+0, -1)

@@ -1,4 +1,3 @@
-from distutils.core import setup
 from setuptools import find_packages, setup
 
 EXCLUDE_FROM_PACKAGES = ['tests',]

