From 4a93ef1f243ef4683c34a147d72d9ad8b9dbb41c Mon Sep 17 00:00:00 2001
From: Pekka Helenius
Date: Sun, 3 May 2020 22:36:27 +0300
Subject: [PATCH] Implement mime type checker

---
 pysitemap/base_crawler.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index 3ce1f88..5a3fe36 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -14,7 +14,7 @@ class Crawler:
         'txt': TextWriter
     }
 
-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, exclude_urls=[], exclude_imgs=[],
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[],
                  verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
                  todo_queue_backend=set, done_backend=dict):
         """
@@ -25,7 +25,7 @@ class Crawler:
         :type out_file: str
         :param out_format: sitemap type [xml | txt]. Default xml
         :type out_format: str
-        :param maxtasks: maximum count of tasks. Default 100
+        :param maxtasks: maximum count of tasks. Default 10
         :type maxtasks: int
         :param exclude_urls: excludable url paths relative to root url
         :type exclude_urls: list
@@ -122,6 +122,27 @@ class Crawler:
         # Add task into tasks
         self.tasks.add(task)
 
+    async def mimechecker(self, url, expected):
+        """
+        Check url resource mimetype
+        :param url: absolute url of the resource to request
+        :type url: str
+        :param expected: regular expression searched in the Content-Type header
+        :type expected: str
+        :return: True if the response status is 200 and the Content-Type
+                 header matches expected, otherwise False
+        :rtype: bool
+        """
+        try:
+            resp = await self.session.get(url)
+        except Exception:
+            # Best effort: a failed request counts as a mismatch
+            return False
+        # The Content-Type header may be absent; use an empty string so that
+        # re.search() is never handed None
+        mime = resp.headers.get('content-type') or ''
+        return resp.status == 200 and re.search(expected, mime) is not None
+
     async def addimages(self, data):
         """
         Find all images in website data
@@ -147,9 +168,13 @@ class Crawler:
         for img in imgs:
             if not await self.contains(img, self.exclude_imgs, rlist=True):
                 if img.startswith(self.rooturl):
-                    imgs_ok.append(img)
+                    if await self.mimechecker(img, r'^image/'):
+                        imgs_ok.append(img)
                 elif not img.startswith("http"):
-                    imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
+                    # Relative path: request the absolute url, not the bare path
+                    img_url = re.sub('/$', '', self.rooturl) + img
+                    if await self.mimechecker(img_url, r'^image/'):
+                        imgs_ok.append(img_url)
         return imgs_ok
 
     async def process(self, url):