Implement mime type checker

5 years ago · 4a93ef1f24
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@ -14,7 +14,7 @@ class Crawler:
        'txt': TextWriter
    }

    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, exclude_urls=[], exclude_imgs=[],
    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[],
                 verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
                 todo_queue_backend=set, done_backend=dict):
        """
@ -25,7 +25,7 @@ class Crawler:
        :type out_file: str
        :param out_format: sitemap type [xml | txt]. Default xml
        :type out_format: str
        :param maxtasks: maximum count of tasks. Default 100
        :param maxtasks: maximum count of tasks. Default 10
        :type maxtasks: int
        :param exclude_urls: excludable url paths relative to root url
        :type exclude_urls: list
@ -122,6 +122,21 @@ class Crawler:
                # Add task into tasks
                self.tasks.add(task)

    async def mimechecker(self, url, expected):
        """
        Check url resource mimetype

        Requests ``url`` through ``self.session`` and searches the response's
        ``content-type`` header for the ``expected`` pattern.

        :param url: address of the resource to check
        :type url: str
        :param expected: regular expression searched for in the content type
        :type expected: str
        :return: True only when the request succeeds with status 200 and the
            content type matches ``expected``; False in every other case,
            including a failed request or a missing content-type header
        :rtype: bool
        """
        try:
            resp = await self.session.get(url)
        except Exception:
            # Best-effort check: a resource that cannot be fetched is
            # reported as "not matching" instead of aborting the caller.
            return False
        try:
            # Fall back to '' so a response without a content-type header
            # is a plain mismatch rather than a TypeError inside re.search.
            mime = resp.headers.get('content-type') or ''
            return resp.status == 200 and re.search(expected, mime) is not None
        finally:
            # Only the headers are inspected; hand the connection back so
            # repeated checks do not leak open responses.
            resp.release()

    async def addimages(self, data):
        """
        Find all images in website data
@ -147,9 +162,11 @@ class Crawler:
        for img in imgs:
            if not await self.contains(img, self.exclude_imgs, rlist=True):
                if img.startswith(self.rooturl):
                    imgs_ok.append(img)
                      if await self.mimechecker(img, '^image\/'):
                          imgs_ok.append(img)
                elif not img.startswith("http"):
                    imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
                      if await self.mimechecker(img, '^image\/'):
                          imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
        return imgs_ok

    async def process(self, url):