diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py
index afbfe71..66df066 100644
--- a/pysitemap/__init__.py
+++ b/pysitemap/__init__.py
@@ -1,20 +1,34 @@
-import asyncio
+import asyncio
import signal
from pysitemap.base_crawler import Crawler
-def crawler(root_url, out_file, out_format='xml', maxtasks=100):
+def crawler(
+ root_url, out_file, out_format='xml',
+ maxtasks=10, exclude_urls=[], verifyssl=True,
+ headers=None, timezone_offset=0, changefreq=None,
+ priorities=None):
"""
run the crawler
:param root_url: Site root url
:param out_file: path to the out file
:param out_format: format of out file [xml, txt]
:param maxtasks: max count of tasks
+ :param exclude_urls: list of regex patterns for url paths to exclude from the crawl
+ :param verifyssl: verify the website's SSL certificate
+ :param headers: headers to send with every request
+ :param timezone_offset: timezone offset (in hours) applied to lastmod tags
+ :param changefreq: dictionary mapping a sub-url regex to its changefreq value
+ :param priorities: dictionary mapping a sub-url regex to its priority float
:return:
"""
loop = asyncio.get_event_loop()
- c = Crawler(root_url, out_file=out_file, out_format=out_format, maxtasks=maxtasks)
+ c = Crawler(root_url, out_file=out_file, out_format=out_format,
+ maxtasks=maxtasks, exclude_urls=exclude_urls, verifyssl=verifyssl,
+ headers=headers, timezone_offset=timezone_offset,
+ changefreq=changefreq, priorities=priorities)
+
loop.run_until_complete(c.run())
try:
@@ -23,5 +37,5 @@ def crawler(root_url, out_file, out_format='xml', maxtasks=100):
pass
print('todo_queue:', len(c.todo_queue))
print('busy:', len(c.busy))
- print('done:', len(c.done), '; ok:', sum(c.done.values()))
- print('tasks:', len(c.tasks))
\ No newline at end of file
+ print('done:', len(c.done), '; ok:', sum(value[0] for value in c.done.values()))
+ print('tasks:', len(c.tasks))
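
For context, a minimal usage sketch of the extended crawler() entry point; the site URL, headers and regex values below are illustrative placeholders, not project defaults:

    from pysitemap import crawler

    # hypothetical example values; only the parameter names come from the code above
    crawler(
        'https://www.example.com/',            # root_url, crawl starts here
        out_file='sitemap.xml',
        out_format='xml',
        maxtasks=10,
        exclude_urls=[r'/login', r'\?page='],  # regexes; matching urls are skipped
        verifyssl=True,
        headers={'User-Agent': 'pysitemap'},   # sent with every request
        timezone_offset=3,                     # hours applied to lastmod values
        changefreq={r'/blog/': 'daily'},       # sub-url regex -> changefreq
        priorities={r'/blog/': 0.7},           # sub-url regex -> priority
    )
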
diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index d282c32..cf54c79 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -1,4 +1,4 @@
-import logging
+import logging
import asyncio
import re
import urllib.parse
@@ -14,7 +14,8 @@ class Crawler:
'txt': TextWriter
}
- def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100,
+ def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], verifyssl=True,
+ headers=None, timezone_offset=0, changefreq=None, priorities=None,
todo_queue_backend=set, done_backend=dict):
"""
Crawler constructor
@@ -24,18 +25,35 @@ class Crawler:
:type out_file: str
:param out_format: sitemap type [xml | txt]. Default xml
:type out_format: str
- :param maxtasks: maximum count of tasks. Default 100
+ :param maxtasks: maximum count of tasks. Default 10
:type maxtasks: int
+ :param exclude_urls: list of regex patterns for url paths to exclude, relative to the root url
+ :type exclude_urls: list
+ :param verifyssl: verify the website's SSL certificate
+ :type verifyssl: bool
+ :param timezone_offset: timezone offset (in hours) applied to lastmod tags
+ :type timezone_offset: int
+ :param changefreq: dictionary mapping a sub-url regex to its changefreq value
+ :type changefreq: dict
+ :param priorities: dictionary mapping a sub-url regex to its priority float
+ :type priorities: dict
"""
self.rooturl = rooturl
+ self.exclude_urls = exclude_urls
self.todo_queue = todo_queue_backend()
self.busy = set()
self.done = done_backend()
self.tasks = set()
self.sem = asyncio.Semaphore(maxtasks)
+ self.timezone_offset = timezone_offset
+ self.changefreq = changefreq
+ self.priorities = priorities
# connector stores cookies between requests and uses connection pool
- self.session = aiohttp.ClientSession()
+ self.session = aiohttp.ClientSession(
+ headers=headers,
+ connector=aiohttp.TCPConnector(verify_ssl=verifyssl)
+ )
self.writer = self.format_processors.get(out_format)(out_file)
async def run(self):
@@ -50,7 +68,29 @@ class Crawler:
await t
await self.session.close()
- await self.writer.write([key for key, value in self.done.items() if value])
+ await self.writer.write([(key, value) for key, value in self.done.items() if key and value[0]], self.timezone_offset)
+
+ async def contains(self, url, regex, rlist=True):
+ """
+ Does the url path match a pattern in regex (a list when rlist=True, otherwise a single regex)?
+ """
+ retvalue = False
+ if rlist:
+ for exc in regex:
+ retvalue = bool(re.search(exc, url))
+ if retvalue: return retvalue
+ else:
+ retvalue = bool(re.search(regex, url))
+ return retvalue
+
+ async def urldict(self, url, url_dict):
+ """
+ Return the value of the first regex key in url_dict that matches the url, or None
+ """
+ for urlkey, regvalue in url_dict.items():
+ if await self.contains(url, urlkey, rlist=False):
+ return regvalue
+ return None
async def addurls(self, urls):
"""
@@ -61,7 +101,9 @@ class Crawler:
for url, parenturl in urls:
url = urllib.parse.urljoin(parenturl, url)
url, frag = urllib.parse.urldefrag(url)
+
if (url.startswith(self.rooturl) and
+ not await self.contains(url, self.exclude_urls, rlist=True) and
url not in self.busy and
url not in self.done and
url not in self.todo_queue):
@@ -89,26 +131,37 @@ class Crawler:
self.todo_queue.remove(url)
self.busy.add(url)
+ lastmod = None
+ cf = None
+ pr = None
+
try:
resp = await self.session.get(url) # await response
except Exception as exc:
# on any exception mark url as BAD
print('...', url, 'has error', repr(str(exc)))
- self.done[url] = False
+ self.done[url] = [False, lastmod, cf, pr]
else:
# only url with status == 200 and content type == 'text/html' parsed
if (resp.status == 200 and
('text/html' in resp.headers.get('content-type'))):
data = (await resp.read()).decode('utf-8', 'replace')
urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
+ lastmod = resp.headers.get('last-modified')
+
asyncio.Task(self.addurls([(u, url) for u in urls]))
+ if self.changefreq is not None:
+ cf = await self.urldict(url, self.changefreq)
+
+ if self.priorities is not None:
+ pr = await self.urldict(url, self.priorities)
+
# even if we have no exception, we can mark url as good
resp.close()
- self.done[url] = True
+
+ self.done[url] = [True, lastmod, cf, pr]
self.busy.remove(url)
logging.info(len(self.done), 'completed tasks,', len(self.tasks),
'still pending, todo_queue', len(self.todo_queue))
-
-
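
Each self.done entry now holds [ok, lastmod, changefreq, priority] instead of a bare boolean. A standalone sketch of the regex lookup performed by the new contains()/urldict() helpers, rewritten here without the async wrappers and with hypothetical dictionaries and URL:

    import re

    changefreq = {r'/blog/': 'daily', r'/docs/': 'weekly'}

    def urldict(url, url_dict):
        # value of the first regex key that matches the url, else None
        for pattern, value in url_dict.items():
            if re.search(pattern, url):
                return value
        return None

    url = 'https://www.example.com/blog/post-1'
    entry = [True, 'Mon, 06 Feb 2023 10:00:00 GMT', urldict(url, changefreq), None]
    # -> [True, 'Mon, 06 Feb 2023 10:00:00 GMT', 'daily', None]
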
diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py
index 9446bb4..300394c 100644
--- a/pysitemap/format_processors/xml.py
+++ b/pysitemap/format_processors/xml.py
@@ -1,14 +1,14 @@
import asyncio
from aiofile import AIOFile, Reader, Writer
import logging
-
+from datetime import datetime, timezone, timedelta
class XMLWriter():
def __init__(self, filename: str):
self.filename = filename
- async def write(self, urls):
+ async def write(self, urls, timezone_offset):
async with AIOFile(self.filename, 'w') as aiodf:
writer = Writer(aiodf)
await writer('<?xml version="1.0" encoding="utf-8"?>\n')
@@ -17,10 +17,26 @@ class XMLWriter():
' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n')
await aiodf.fsync()
- for url in urls:
- await writer('<url><loc>{}</loc></url>\n'.format(url))
+ for data in urls:
+
+ timestamp = data[1][1]
+ changefreq = data[1][2]
+ priority = data[1][3]
+ url = "<loc>{}</loc>".format(data[0])
+
+ if timestamp is not None:
+ timestamp = datetime.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z").astimezone(tz=timezone(timedelta(hours=timezone_offset))).isoformat()
+ url += "<lastmod>{}</lastmod>".format(timestamp)
+
+ if changefreq is not None:
+ url += "<changefreq>{}</changefreq>".format(changefreq)
+
+ if priority is not None:
+ url += "<priority>{}</priority>".format(priority)
+
+ await writer('<url>{}</url>\n'.format(url))
+
await aiodf.fsync()
await writer('</urlset>')
await aiodf.fsync()
-
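
Finally, a rough sketch of the lastmod conversion that XMLWriter.write() applies to a Last-Modified header; the header value and offset are example inputs, not output of the code above:

    from datetime import datetime, timezone, timedelta

    last_modified = 'Mon, 06 Feb 2023 10:00:00 GMT'  # typical HTTP Last-Modified header
    timezone_offset = 2                              # hours, as passed through from crawler()

    parsed = datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S %Z')
    lastmod = parsed.astimezone(timezone(timedelta(hours=timezone_offset))).isoformat()
    print(lastmod)  # ISO 8601 timestamp with a +02:00 offset, ready for a lastmod tag
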