
aiohttp and aiofile

from-knocker
Kamo Petrosyan 4 years ago
parent
commit
e11e289e5f
7 changed files with 140 additions and 1548 deletions
  1. pysitemap/__init__.py                     +27   -1
  2. pysitemap/base_crawler.py                 +75   -0
  3. pysitemap/crawler.py                       +0  -68
  4. pysitemap/format_processors/__init__.py    +0   -0
  5. pysitemap/format_processors/xml.py        +26   -0
  6. run.py                                    +12  -17
  7. sitemap.xml                                +0  -1462

+27  -1   pysitemap/__init__.py

@@ -1 +1,27 @@
-from pysitemap.crawler import Crawler
+import asyncio
+import signal
+
+from pysitemap.base_crawler import Crawler
+
+
+def crawler(root_url, out_file, out_format='xml', maxtasks=100):
+    """
+    run crawler
+    :param root_url: site root url
+    :param out_file: path to the output file
+    :param out_format: format of the output file [xml, txt]
+    :param maxtasks: maximum number of concurrent tasks
+    :return:
+    """
+    loop = asyncio.get_event_loop()
+    c = Crawler(root_url, out_file=out_file, out_format=out_format, maxtasks=maxtasks)
+    loop.run_until_complete(c.run())
+
+    try:
+        loop.add_signal_handler(signal.SIGINT, loop.stop)
+    except RuntimeError:
+        pass
+    print('todo:', len(c.todo))
+    print('busy:', len(c.busy))
+    print('done:', len(c.done), '; ok:', sum(c.done.values()))
+    print('tasks:', len(c.tasks))
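
A minimal sketch of how this new entry point could be invoked, assuming the package is importable as pysitemap (the URL, output file name, and task count below are placeholders):

# hypothetical usage of the crawler() helper above
from pysitemap import crawler

# blocks until the crawl finishes, then writes the collected URLs via the XML processor
crawler('https://example.com/', out_file='sitemap.xml', out_format='xml', maxtasks=50)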

+75  -0   pysitemap/base_crawler.py

@@ -0,0 +1,75 @@
+import asyncio
+import re
+import urllib.parse
+
+from pysitemap.format_processors.xml import XMLWriter
+
+import aiohttp
+
+
+class Crawler:
+
+    format_processors = {
+        'xml': XMLWriter
+    }
+
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100):
+        self.rooturl = rooturl
+        self.todo = set()
+        self.busy = set()
+        self.done = {}
+        self.tasks = set()
+        self.sem = asyncio.Semaphore(maxtasks)
+        # connector stores cookies between requests and uses connection pool
+        self.session = aiohttp.ClientSession()
+        self.writer = self.format_processors.get(out_format)(out_file)
+
+    async def run(self):
+        t = asyncio.ensure_future(self.addurls([(self.rooturl, '')]))
+        await asyncio.sleep(1)
+        while self.busy:
+            await asyncio.sleep(1)
+
+        await t
+        await self.session.close()
+        await self.writer.write([key for key, value in self.done.items() if value])
+
+    async def addurls(self, urls):
+        for url, parenturl in urls:
+            url = urllib.parse.urljoin(parenturl, url)
+            url, frag = urllib.parse.urldefrag(url)
+            if (url.startswith(self.rooturl) and
+                    url not in self.busy and
+                    url not in self.done and
+                    url not in self.todo):
+                self.todo.add(url)
+                await self.sem.acquire()
+                task = asyncio.ensure_future(self.process(url))
+                task.add_done_callback(lambda t: self.sem.release())
+                task.add_done_callback(self.tasks.remove)
+                self.tasks.add(task)
+
+    async def process(self, url):
+        print('processing:', url)
+        self.todo.remove(url)
+        self.busy.add(url)
+
+        try:
+            resp = await self.session.get(url)
+        except Exception as exc:
+            print('...', url, 'has error', repr(str(exc)))
+            self.done[url] = False
+        else:
+            if (resp.status == 200 and
+                    ('text/html' in resp.headers.get('content-type'))):
+                data = (await resp.read()).decode('utf-8', 'replace')
+                urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
+                asyncio.Task(self.addurls([(u, url) for u in urls]))
+
+            resp.close()
+            self.done[url] = True
+
+        self.busy.remove(url)
+        print(len(self.done), 'completed tasks,', len(self.tasks),
+              'still pending, todo', len(self.todo))
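
addurls() joins each extracted href against the page it was found on and strips the fragment before deciding whether to queue it. A small standalone illustration of that normalization step, using made-up URLs:

import urllib.parse

page = 'https://example.com/blog/'       # page the link was found on
href = '../about.html#team'              # raw href matched by the regex above

url = urllib.parse.urljoin(page, href)   # 'https://example.com/about.html#team'
url, frag = urllib.parse.urldefrag(url)  # 'https://example.com/about.html', 'team'
print(url.startswith('https://example.com'), url, frag)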

+0  -68   pysitemap/crawler.py

@@ -1,68 +0,0 @@
-#!/usr/bin/env python3
-
-import time
-import asyncio
-from aiohttp import ClientSession
-from parsel import Selector
-from urllib.parse import urlparse, urlunparse
-
-
-class Crawler(object):
-
-    def __init__(self, url, sleep_time=.5):
-        self.urls = [url]
-        scheme, netloc, path, params, query, fragment = urlparse(url)
-        if not netloc:
-            netloc, path = path, netloc
-        url = urlunparse((scheme, netloc, "", params, "", fragment))
-        self.base_url = url
-        self.sleep_time = float(sleep_time)
-
-    async def fetch(self, url, session):
-        async with session.get(url) as response:
-            await asyncio.sleep(self.sleep_time)
-            return response.content
-
-    async def bound_fetch(self, sem, url, session):
-        # Getter function with semaphore.
-        async with sem:
-            await self.fetch(url, session)
-
-    def norm_link(self, url: str):
-        if url.startswith(self.base_url):
-            return url
-        elif url.startswith('//'):
-            return "{scheme}{url}".format(
-                scheme=self.base_url[:self.base_url.find(":")],
-                url=url
-            )
-        elif url.startswith("/"):
-            return self.base_url + url
-        return None
-
-    async def parse(self, content):
-        sel = Selector(content)
-        links = sel.xpath('//a/@href').getall()
-        normalized_links = []
-        for link in links:
-            link = self.norm_link(link)
-            if link:
-                normalized_links.append(link)
-        self.urls.extend(normalized_links)
-
-    async def run(self):
-        tasks = []
-        # create an instance of Semaphore
-        sem = asyncio.Semaphore(20)
-        # Create a client session so that we don't open a new connection
-        # for each request.
-        async with ClientSession() as session:
-            for url in self.urls:
-                # pass the Semaphore and session to every GET request
-                task = asyncio.ensure_future(self.bound_fetch(sem, url, session))
-                tasks.append(task)
-            responses = asyncio.gather(*tasks)
-            await responses

+0  -0   pysitemap/format_processors/__init__.py


+26  -0   pysitemap/format_processors/xml.py

@@ -0,0 +1,26 @@
+import asyncio
+from aiofile import AIOFile, Reader, Writer
+import logging
+
+
+class XMLWriter():
+
+    def __init__(self, filename: str):
+        self.filename = filename
+
+    async def write(self, urls):
+        async with AIOFile(self.filename, 'w') as aiodf:
+            writer = Writer(aiodf)
+            await writer('<?xml version="1.0" encoding="utf-8"?>\n')
+            await writer(
+                '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
+                ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
+                ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n')
+            await aiodf.fsync()
+            for url in urls:
+                await writer('<url><loc>{}</loc></url>\n'.format(url))
+            await aiodf.fsync()
+            await writer('</urlset>')
+            await aiodf.fsync()
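
The writer can also be exercised on its own; a minimal sketch, assuming aiofile is installed and using a placeholder file name and URL list:

import asyncio
from pysitemap.format_processors.xml import XMLWriter

urls = ['https://example.com/', 'https://example.com/about/']
asyncio.run(XMLWriter('sitemap.xml').write(urls))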

+12  -17   run.py

@@ -1,20 +1,15 @@
-from pysitemap import Crawler
-import asyncio
-"""
-Example script
-Uses gevent to implement multiprocessing if gevent is installed
-To install gevent:
-$ pip install gevent
-"""
+import sys
+import logging
+from pysitemap import crawler
 
 
 if __name__ == '__main__':
-    url = 'http://www.stroivopros.ru/'  # url to crawl
-    logfile = 'errlog.log'  # path to logfile
-    oformat = 'xml'  # output format
-    outputfile = 'sitemap.xml'  # path to output file
-
-    loop = asyncio.get_event_loop()
-    crawler: Crawler = Crawler(url=url)
-    future = asyncio.ensure_future(crawler.run())
-    loop.run_until_complete(future)
+    if '--iocp' in sys.argv:
+        from asyncio import events, windows_events
+        sys.argv.remove('--iocp')
+        logging.info('using iocp')
+        el = windows_events.ProactorEventLoop()
+        events.set_event_loop(el)
+    # root_url = sys.argv[1]
+    root_url = 'https://www.haikson.com'
+    crawler(root_url, out_file='sitemap.xml')

+0  -1462   sitemap.xml
File diff suppressed because it is too large

