
Add v.0.9.3 features

master
Pekka Helenius 4 years ago
parent
commit
2d232c6b09
3 changed files with 102 additions and 19 deletions
  1. pysitemap/__init__.py  +19 -5
  2. pysitemap/base_crawler.py  +62 -9
  3. pysitemap/format_processors/xml.py  +21 -5

pysitemap/__init__.py  +19 -5

@@ -1,20 +1,34 @@
import asyncio
import signal
from pysitemap.base_crawler import Crawler
def crawler(root_url, out_file, out_format='xml', maxtasks=100):
def crawler(
root_url, out_file, out_format='xml',
maxtasks=10, exclude_urls=[], verifyssl=True,
headers=None, timezone_offset=0, changefreq=None,
priorities=None):
"""
run crawler
:param root_url: Site root url
:param out_file: path to the out file
:param out_format: format of out file [xml, txt]
:param maxtasks: max count of tasks
:param exclude_urls: excludable url paths
:param verifyssl: verify website certificate?
:param headers: Send these headers in every request
:param timezone_offset: timezone offset for lastmod tags
:param changefreq: dictionary where the key is a site sub-url regex and the value is the changefreq
:param priorities: dictionary where the key is a site sub-url regex and the value is the priority float
:return:
"""
loop = asyncio.get_event_loop()
c = Crawler(root_url, out_file=out_file, out_format=out_format, maxtasks=maxtasks)
c = Crawler(root_url, out_file=out_file, out_format=out_format,
maxtasks=maxtasks, exclude_urls=exclude_urls, verifyssl=verifyssl,
headers=headers, timezone_offset=timezone_offset,
changefreq=changefreq, priorities=priorities)
loop.run_until_complete(c.run())
try:
@@ -23,5 +37,5 @@ def crawler(root_url, out_file, out_format='xml', maxtasks=100):
pass
print('todo_queue:', len(c.todo_queue))
print('busy:', len(c.busy))
print('done:', len(c.done), '; ok:', sum(c.done.values()))
print('tasks:', len(c.tasks))
print('done:', len(c.done), '; ok:', sum(list(zip(*c.done.values()))[0]) )
print('tasks:', len(c.tasks))
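For reference, a minimal usage sketch of the extended entry point; the site URL, regex keys, header and offset values below are placeholder assumptions, not part of this commit.

# Hypothetical call to crawler() with the new v0.9.3 keyword arguments.
from pysitemap import crawler

if __name__ == '__main__':
    crawler(
        'https://www.example.com/',            # placeholder root url
        out_file='sitemap.xml',
        out_format='xml',
        maxtasks=10,
        exclude_urls=['/wp-admin', '/feed/'],  # regex fragments matched against each url
        verifyssl=True,
        headers={'User-Agent': 'pysitemap'},
        timezone_offset=3,                     # shift lastmod timestamps to UTC+3
        changefreq={'/blog/': 'daily', '.*': 'monthly'},
        priorities={'/blog/': 0.8, '.*': 0.5})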

pysitemap/base_crawler.py  +62 -9

@@ -1,4 +1,4 @@
import logging
import asyncio
import re
import urllib.parse
@@ -14,7 +14,8 @@ class Crawler:
'txt': TextWriter
}
def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100,
def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], verifyssl=True,
headers=None, timezone_offset=0, changefreq=None, priorities=None,
todo_queue_backend=set, done_backend=dict):
"""
Crawler constructor
@@ -24,18 +25,35 @@ class Crawler:
:type out_file: str
:param out_format: sitemap type [xml | txt]. Default xml
:type out_format: str
:param maxtasks: maximum count of tasks. Default 100
:param maxtasks: maximum count of tasks. Default 10
:type maxtasks: int
:param exclude_urls: excludable url paths relative to root url
:type exclude_urls: list
:param verifyssl: verify website certificate?
:type verifyssl: boolean
:param timezone_offset: timezone offset for lastmod tags
:type timezone_offset: int
:param changefreq: dictionary where the key is a site sub-url regex and the value is the changefreq
:type changefreq: dict
:param priorities: dictionary where the key is a site sub-url regex and the value is the priority float
:type priorities: dict
"""
self.rooturl = rooturl
self.exclude_urls = exclude_urls
self.todo_queue = todo_queue_backend()
self.busy = set()
self.done = done_backend()
self.tasks = set()
self.sem = asyncio.Semaphore(maxtasks)
self.timezone_offset = timezone_offset
self.changefreq = changefreq
self.priorities = priorities
# connector stores cookies between requests and uses connection pool
self.session = aiohttp.ClientSession()
self.session = aiohttp.ClientSession(
headers=headers,
connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
)
self.writer = self.format_processors.get(out_format)(out_file)
async def run(self):
@@ -50,7 +68,29 @@ class Crawler:
await t
await self.session.close()
await self.writer.write([key for key, value in self.done.items() if value])
await self.writer.write([(key, value) for key, value in self.done.items() if key and value], self.timezone_offset)
async def contains(self, url, regex, rlist=True):
"""
Does the url path match a value in the regex list?
"""
retvalue = False
if rlist:
for exc in regex:
retvalue = bool(re.search(re.compile(r"{}".format(exc)), url))
if retvalue: return retvalue
else:
retvalue = bool(re.search(re.compile(r"{}".format(regex)), url))
return retvalue
async def urldict(self, url, url_dict):
"""
Parse URL regex (key) and value pairs
"""
if url_dict is None: return None
for urlkey, regvalue in url_dict.items():
if await self.contains(url, urlkey, rlist=False):
return regvalue
return None
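In other words, urldict() returns the value of the first regex key that matches the crawled url, or None when nothing matches. A rough synchronous equivalent, with a made-up dictionary, for illustration:

import re

def resolve(url, url_dict):
    # first regex key that matches the url wins; insertion order decides ties
    for pattern, value in url_dict.items():
        if re.search(pattern, url):
            return value
    return None

changefreq = {'/blog/': 'daily', '.*': 'monthly'}
print(resolve('https://www.example.com/blog/post-1', changefreq))  # -> daily
print(resolve('https://www.example.com/about', changefreq))        # -> monthly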
async def addurls(self, urls):
"""
@@ -61,7 +101,9 @@
for url, parenturl in urls:
url = urllib.parse.urljoin(parenturl, url)
url, frag = urllib.parse.urldefrag(url)
if (url.startswith(self.rooturl) and
not await self.contains(url, self.exclude_urls, rlist=True) and
url not in self.busy and
url not in self.done and
url not in self.todo_queue):
@@ -89,26 +131,37 @@
self.todo_queue.remove(url)
self.busy.add(url)
lastmod = None
cf = None
pr = None
try:
resp = await self.session.get(url) # await response
except Exception as exc:
# on any exception mark url as BAD
print('...', url, 'has error', repr(str(exc)))
self.done[url] = False
self.done[url] = [False, lastmod, cf, pr]
else:
# only url with status == 200 and content type == 'text/html' parsed
if (resp.status == 200 and
('text/html' in resp.headers.get('content-type'))):
data = (await resp.read()).decode('utf-8', 'replace')
urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
lastmod = resp.headers.get('last-modified')
asyncio.Task(self.addurls([(u, url) for u in urls]))
try: cf = await self.urldict(url, self.changefreq)
except IndexError: pass
try: pr = await self.urldict(url, self.priorities)
except IndexError: pass
# no exception occurred, so the url can be marked as good
resp.close()
self.done[url] = True
self.done[url] = [True, lastmod, cf, pr]
self.busy.remove(url)
logging.info('%d completed tasks, %d still pending, todo_queue %d',
len(self.done), len(self.tasks), len(self.todo_queue))
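With these changes each self.done entry carries the per-url metadata the writer consumes. An illustrative snapshot of what self.done might contain (urls, dates and values are made up):

# key: crawled url, value: [ok, Last-Modified header, changefreq, priority]
done = {
    'https://www.example.com/':            [True, 'Mon, 06 May 2019 10:00:00 GMT', 'monthly', 0.5],
    'https://www.example.com/blog/post-1': [True, 'Tue, 07 May 2019 08:30:00 GMT', 'daily', 0.8],
    'https://www.example.com/broken':      [False, None, None, None],
}
ok = sum(list(zip(*done.values()))[0])  # 2: counts entries whose first field is True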

pysitemap/format_processors/xml.py  +21 -5

@@ -1,14 +1,14 @@
import asyncio
from aiofile import AIOFile, Reader, Writer
import logging
from datetime import datetime, timezone, timedelta
class XMLWriter():
def __init__(self, filename: str):
self.filename = filename
async def write(self, urls):
async def write(self, urls, timezone_offset):
async with AIOFile(self.filename, 'w') as aiodf:
writer = Writer(aiodf)
await writer('<?xml version="1.0" encoding="utf-8"?>\n')
@@ -17,10 +17,26 @@ class XMLWriter():
' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n')
await aiodf.fsync()
for url in urls:
await writer('<url><loc>{}</loc></url>\n'.format(url))
for data in urls:
timestamp = data[1][1]
changefreq = data[1][2]
priority = data[1][3]
url = "<loc>{}</loc>".format(data[0])
if timestamp is not None:
timestamp = datetime.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z").astimezone(tz=timezone(timedelta(hours=timezone_offset))).isoformat()
url += "<lastmod>{}</lastmod>".format(str(timestamp))
if changefreq is not None:
url += "<changefreq>{}</changefreq>".format(str(changefreq))
if priority is not None:
url += "<priority>{}</priority>".format(str(priority))
await writer('<url>{}</url>\n'.format(url))
await aiodf.fsync()
await writer('</urlset>')
await aiodf.fsync()
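The lastmod value is derived from the HTTP Last-Modified header. A standalone sketch of the same conversion as above, with an example header value and offset:

from datetime import datetime, timezone, timedelta

header = 'Mon, 06 May 2019 10:00:00 GMT'   # example Last-Modified value
timezone_offset = 3                         # hours, as passed to XMLWriter.write()

# Note: %Z leaves the parsed datetime naive, so astimezone() interprets it as
# local time before converting it into the requested fixed offset.
lastmod = datetime.strptime(header, "%a, %d %b %Y %H:%M:%S %Z") \
    .astimezone(tz=timezone(timedelta(hours=timezone_offset))) \
    .isoformat()
print(lastmod)  # ISO 8601 string with a +03:00 offset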
