|
|
@@ -14,8 +14,8 @@ class Crawler:
         'txt': TextWriter
     }

-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], verifyssl=True,
-                 headers=None, timezone_offset=0, changefreq=None, priorities=None,
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, exclude_urls=[], exclude_imgs=[],
+                 verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
                  todo_queue_backend=set, done_backend=dict):
         """
         Crawler constructor
@@ -25,10 +25,12 @@ class Crawler:
         :type out_file: str
         :param out_format: sitemap type [xml | txt]. Default xml
         :type out_format: str
-        :param maxtasks: maximum count of tasks. Default 10
+        :param maxtasks: maximum count of tasks. Default 100
         :type maxtasks: int
         :param exclude_urls: excludable url paths relative to root url
         :type exclude_urls: list
+        :param exclude_imgs: excludable img url paths relative to root url
+        :type exclude_imgs: list
         :param verifyssl: verify website certificate?
         :type verifyssl: boolean
         :param timezone_offset: timezone offset for lastmod tags
@@ -40,6 +42,7 @@ class Crawler:
         """
         self.rooturl = rooturl
         self.exclude_urls = exclude_urls
+        self.exclude_imgs = exclude_imgs
         self.todo_queue = todo_queue_backend()
         self.busy = set()
         self.done = done_backend()
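The constructor now takes exclude_imgs next to exclude_urls, so image paths can be filtered out of the sitemap the same way page paths are. A minimal usage sketch, assuming the import path and a run() coroutine entry point; neither appears in this hunk:

    import asyncio

    from pysitemap.crawler import Crawler  # assumed import path

    c = Crawler(
        rooturl='https://www.example.com/',
        out_file='sitemap.xml',
        out_format='xml',
        maxtasks=100,
        exclude_urls=['/tag/', '/search/'],           # page paths to skip
        exclude_imgs=['/static/icons/', 'logo.png'],  # image paths to skip
    )

    loop = asyncio.get_event_loop()
    loop.run_until_complete(c.run())  # assumed entry point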
|
|
@@ -119,6 +122,36 @@ class Crawler:
                 # Add task into tasks
                 self.tasks.add(task)

+    async def addimages(self, data):
+        """
+        Find all images in website data
+        """
+        imgs = []
+        imgs_ok = []
+        lines_tmp = []
+        tag = False
+        for line in data.split('\n'):
+            if re.search(r'<img', line):
+                tag = True
+            if re.search(r'<img', line) and re.search(r'\/>', line):
+                tag = False
+                lines_tmp.append(line)
+                continue
+            if re.search(r'\/>', line) and tag:
+                tag = False
+            if tag:
+                lines_tmp.append(line)
+
+        imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
+
+        for img in imgs:
+            if not await self.contains(img, self.exclude_imgs, rlist=True):
+                if img.startswith(self.rooturl):
+                    imgs_ok.append(img)
+                elif not img.startswith("http"):
+                    imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
+        return imgs_ok
+
     async def process(self, url):
         """
         Process single url
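addimages() scans the page line by line to collect <img> tags (including tags split across several lines), pulls the src values out with a regular expression, drops anything matching exclude_imgs, and keeps only on-site images, prefixing root-relative paths with rooturl. Below is a standalone sketch of that extraction using made-up sample HTML and a made-up root URL; the exclude_imgs check (self.contains) is left out because that helper is not shown in this hunk:

    import re

    ROOT = 'https://www.example.com/'  # stand-in for self.rooturl

    html = '''<p>intro text</p>
    <img src="/static/a.png"
         alt="a" />
    <img src="https://cdn.example.org/b.png" alt="b" />'''

    # Scan for <img> tags line by line, the same way addimages() does.
    lines_tmp = []
    tag = False
    for line in html.split('\n'):
        if re.search(r'<img', line):
            tag = True
        if re.search(r'<img', line) and re.search(r'\/>', line):
            tag = False
            lines_tmp.append(line)
            continue
        if re.search(r'\/>', line) and tag:
            tag = False
        if tag:
            lines_tmp.append(line)

    # Pull the src attributes out of the collected lines.
    srcs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
    print(srcs)     # ['/static/a.png', 'https://cdn.example.org/b.png']

    # Keep on-site images only; make root-relative paths absolute.
    imgs_ok = []
    for img in srcs:
        if img.startswith(ROOT):
            imgs_ok.append(img)
        elif not img.startswith('http'):
            imgs_ok.append(re.sub('/$', '', ROOT) + img)
    print(imgs_ok)  # ['https://www.example.com/static/a.png']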
|
|
@@ -134,13 +167,14 @@ class Crawler:
         lastmod = None
         cf = None
         pr = None
+        imgs = []

         try:
             resp = await self.session.get(url)  # await response
         except Exception as exc:
             # on any exception mark url as BAD
             print('...', url, 'has error', repr(str(exc)))
-            self.done[url] = [False, lastmod, cf, pr]
+            self.done[url] = [False, lastmod, cf, pr, imgs]
         else:
             # only url with status == 200 and content type == 'text/html' parsed
             if (resp.status == 200 and
@@ -148,7 +182,7 @@ class Crawler:
                 data = (await resp.read()).decode('utf-8', 'replace')
                 urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                 lastmod = resp.headers.get('last-modified')
-
+                imgs = await self.addimages(data)
                 asyncio.Task(self.addurls([(u, url) for u in urls]))

                 try: pr = await self.urldict(url, self.changefreq)
@@ -160,7 +194,7 @@ class Crawler:
         # even if we have no exception, we can mark url as good
         resp.close()

-        self.done[url] = [True, lastmod, cf, pr]
+        self.done[url] = [True, lastmod, cf, pr, imgs]

         self.busy.remove(url)
         logging.info(len(self.done), 'completed tasks,', len(self.tasks),
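With this change every self.done entry carries the collected image URLs as a fifth element, [ok, lastmod, cf, pr, imgs], with imgs left empty for failed or non-HTML responses. A hypothetical consumer loop, purely to illustrate the new shape; the actual XML/TXT writers are not shown in this patch:

    # done: a dict shaped like Crawler.done after a finished crawl
    for url, (ok, lastmod, cf, pr, imgs) in done.items():
        if not ok:
            continue                 # skip pages that failed to fetch
        print(url, lastmod, cf, pr)
        for img in imgs:
            print('  image:', img)   # on-site image URLs found on the page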
|
|
|