diff --git a/README.md b/README.md
index eb5c88a..3af4cee 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,12 @@ if __name__ == '__main__':
'[a-zA-Z0-9]*\.[a-zA-Z0-9]*$',
'\?\.php',
],
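+        # image URLs matching any of these patterns are skipped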
+ exclude_imgs=[
+ 'logo\.(png|jpg)',
+ 'avatars',
+ 'avatar_default',
+ '/symbols/'
+ ],
headers={'User-Agent': 'Crawler'},
# TZ offset in hours
timezone_offset=3,
diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py
index 5a9bb92..9626064 100644
--- a/pysitemap/__init__.py
+++ b/pysitemap/__init__.py
@@ -5,7 +5,7 @@ from pysitemap.base_crawler import Crawler
def crawler(
root_url, out_file, out_format='xml',
- maxtasks=10, exclude_urls=[], verifyssl=True,
+ maxtasks=10, exclude_urls=[], exclude_imgs=[], verifyssl=True,
headers=None, timezone_offset=0, changefreq=None,
priorities=None):
"""
@@ -15,6 +15,7 @@ def crawler(
:param out_format: format of out file [xml, txt]
:param maxtasks: max count of tasks
:param exclude_urls: excludable url paths
+ :param exclude_imgs: excludable img url paths
:param verifyssl: verify website certificate?
:param headers: Send these headers in every request
:param timezone_offset: timezone offset for lastmod tags
@@ -25,7 +26,7 @@ def crawler(
loop = asyncio.get_event_loop()
c = Crawler(root_url, out_file=out_file, out_format=out_format,
- maxtasks=maxtasks, exclude_urls=exclude_urls, verifyssl=verifyssl,
+ maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, verifyssl=verifyssl,
headers=headers, timezone_offset=timezone_offset,
changefreq=changefreq, priorities=priorities)
diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index 991c241..3ce1f88 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -14,8 +14,8 @@ class Crawler:
'txt': TextWriter
}
- def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], verifyssl=True,
- headers=None, timezone_offset=0, changefreq=None, priorities=None,
+ def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, exclude_urls=[], exclude_imgs=[],
+ verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
todo_queue_backend=set, done_backend=dict):
"""
Crawler constructor
@@ -25,10 +25,12 @@ class Crawler:
:type out_file: str
:param out_format: sitemap type [xml | txt]. Default xml
:type out_format: str
- :param maxtasks: maximum count of tasks. Default 10
+ :param maxtasks: maximum count of tasks. Default 100
:type maxtasks: int
:param exclude_urls: excludable url paths relative to root url
:type exclude_urls: list
+ :param exclude_imgs: excludable img url paths relative to root url
+ :type exclude_imgs: list
:param verifyssl: verify website certificate?
:type verifyssl: boolean
:param timezone_offset: timezone offset for lastmod tags
@@ -40,6 +42,7 @@ class Crawler:
"""
self.rooturl = rooturl
self.exclude_urls = exclude_urls
+ self.exclude_imgs = exclude_imgs
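+        # image URL patterns; addimages() drops any image that matches one of them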
self.todo_queue = todo_queue_backend()
self.busy = set()
self.done = done_backend()
@@ -119,6 +122,36 @@ class Crawler:
# Add task into tasks
self.tasks.add(task)
+    async def addimages(self, data):
+        """
+        Find all images in website data
+        """
+        imgs = []
+        imgs_ok = []
+        lines_tmp = []
+        tag = False
+        for line in data.split('\n'):
+            # an <img> tag may span several lines; collect its lines while it is open
+            if re.search(r'<img', line):
+                tag = True
+            if re.search(r'<img.*\/>', line):
+                tag = False
+                lines_tmp.append(line)
+                continue
+            if re.search(r'\/>', line) and tag:
+                tag = False
+            if tag:
+                lines_tmp.append(line)
+
+        imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
+
+        # keep only images that are not excluded; same-site absolute URLs are kept
+        # as-is, site-relative paths are resolved against the root URL
+        for img in imgs:
+            if not await self.contains(img, self.exclude_imgs, rlist=True):
+                if img.startswith(self.rooturl):
+                    imgs_ok.append(img)
+                elif not img.startswith("http"):
+                    imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
+        return imgs_ok
+
async def process(self, url):
"""
Process single url
@@ -134,13 +167,14 @@ class Crawler:
lastmod = None
cf = None
pr = None
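+        # image URLs found on this page; stored as the fifth field of the
+        # done-record and read by the XML writer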
+ imgs = []
try:
resp = await self.session.get(url) # await response
except Exception as exc:
# on any exception mark url as BAD
print('...', url, 'has error', repr(str(exc)))
- self.done[url] = [False, lastmod, cf, pr]
+ self.done[url] = [False, lastmod, cf, pr, imgs]
else:
# only url with status == 200 and content type == 'text/html' parsed
if (resp.status == 200 and
@@ -148,7 +182,7 @@ class Crawler:
data = (await resp.read()).decode('utf-8', 'replace')
urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
lastmod = resp.headers.get('last-modified')
-
+ imgs = await self.addimages(data)
asyncio.Task(self.addurls([(u, url) for u in urls]))
try: pr = await self.urldict(url, self.changefreq)
@@ -160,7 +194,7 @@ class Crawler:
# even if we have no exception, we can mark url as good
resp.close()
- self.done[url] = [True, lastmod, cf, pr]
+ self.done[url] = [True, lastmod, cf, pr, imgs]
self.busy.remove(url)
logging.info(len(self.done), 'completed tasks,', len(self.tasks),
diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py
index 300394c..9eea948 100644
--- a/pysitemap/format_processors/xml.py
+++ b/pysitemap/format_processors/xml.py
@@ -22,6 +22,8 @@ class XMLWriter():
timestamp = data[1][1]
changefreq = data[1][2]
priority = data[1][3]
+ images = data[1][4]
+
                url = "<url><loc>{}</loc>".format(data[0])
if timestamp is not None:
@@ -34,6 +36,10 @@ class XMLWriter():
if priority is not None:
                    url += "<priority>{}</priority>".format(str(priority))
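+                # image entries use the sitemap image extension; assumes the <urlset>
+                # element declares xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"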
+                if len(images) > 0:
+                    for image in images:
+                        url += "<image:image><image:loc>{}</image:loc></image:image>".format(str(image))
+
                await writer('{}</url>\n'.format(url))
await aiodf.fsync()