From e65b8b39f2e37e62fc627cf80cde96dcedd5999f Mon Sep 17 00:00:00 2001
From: Pekka Helenius
Date: Sun, 3 May 2020 22:00:55 +0300
Subject: [PATCH] Implement image crawler

---
 README.md                          |  6 ++++
 pysitemap/__init__.py              |  5 ++--
 pysitemap/base_crawler.py          | 46 ++++++++++++++++++++++++++----
 pysitemap/format_processors/xml.py |  6 ++++
 4 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index eb5c88a..3af4cee 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,12 @@ if __name__ == '__main__':
             '[a-zA-Z0-9]*\.[a-zA-Z0-9]*$',
             '\?\.php',
         ],
+        exclude_imgs=[
+            'logo\.(png|jpg)',
+            'avatars',
+            'avatar_default',
+            '/symbols/'
+        ],
         headers={'User-Agent': 'Crawler'},
         # TZ offset in hours
         timezone_offset=3,
diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py
index 5a9bb92..9626064 100644
--- a/pysitemap/__init__.py
+++ b/pysitemap/__init__.py
@@ -5,7 +5,7 @@ from pysitemap.base_crawler import Crawler
 
 def crawler(
         root_url, out_file, out_format='xml',
-        maxtasks=10, exclude_urls=[], verifyssl=True,
+        maxtasks=10, exclude_urls=[], exclude_imgs=[], verifyssl=True,
         headers=None, timezone_offset=0, changefreq=None,
         priorities=None):
     """
@@ -15,6 +15,7 @@
     :param out_format: format of out file [xml, txt]
     :param maxtasks: max count of tasks
     :param exclude_urls: excludable url paths
+    :param exclude_imgs: excludable img url paths
     :param verifyssl: verify website certificate?
     :param headers: Send these headers in every request
     :param timezone_offset: timezone offset for lastmod tags
@@ -25,7 +26,7 @@
     loop = asyncio.get_event_loop()
 
     c = Crawler(root_url, out_file=out_file, out_format=out_format,
-                maxtasks=maxtasks, exclude_urls=exclude_urls, verifyssl=verifyssl,
+                maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, verifyssl=verifyssl,
                 headers=headers, timezone_offset=timezone_offset, changefreq=changefreq,
                 priorities=priorities)
 
diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index 991c241..3ce1f88 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -14,8 +14,8 @@ class Crawler:
         'txt': TextWriter
     }
 
-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], verifyssl=True,
-                 headers=None, timezone_offset=0, changefreq=None, priorities=None,
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, exclude_urls=[], exclude_imgs=[],
+                 verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
                  todo_queue_backend=set, done_backend=dict):
         """
         Crawler constructor
@@ -25,10 +25,12 @@
         :type out_file: str
         :param out_format: sitemap type [xml | txt]. Default xml
         :type out_format: str
-        :param maxtasks: maximum count of tasks. Default 10
+        :param maxtasks: maximum count of tasks. Default 100
         :type maxtasks: int
         :param exclude_urls: excludable url paths relative to root url
        :type exclude_urls: list
+        :param exclude_imgs: excludable img url paths relative to root url
+        :type exclude_imgs: list
         :param verifyssl: verify website certificate?
         :type verifyssl: boolean
         :param timezone_offset: timezone offset for lastmod tags
@@ -40,6 +42,7 @@ class Crawler:
         """
         self.rooturl = rooturl
         self.exclude_urls = exclude_urls
+        self.exclude_imgs = exclude_imgs
         self.todo_queue = todo_queue_backend()
         self.busy = set()
         self.done = done_backend()
@@ -119,6 +122,36 @@ class Crawler:
             # Add task into tasks
             self.tasks.add(task)
 
+    async def addimages(self, data):
+        """
+        Find all images in website data
+        """
+        imgs = []
+        imgs_ok = []
+        lines_tmp = []
+        tag = False
+        for line in data.split('\n'):
+            if re.search(r'<img', line):
+                tag = True
+            if re.search(r'<\/img>', line):
+                tag = False
+                lines_tmp.append(line)
+                continue
+            if re.search(r'\/>', line) and tag:
+                tag = False
+            if tag:
+                lines_tmp.append(line)
+
+        imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
+
+        for img in imgs:
+            if not await self.contains(img, self.exclude_imgs, rlist=True):
+                if img.startswith(self.rooturl):
+                    imgs_ok.append(img)
+                elif not img.startswith("http"):
+                    imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
+        return imgs_ok
+
     async def process(self, url):
         """
         Process single url
@@ -134,13 +167,14 @@
         lastmod = None
         cf = None
         pr = None
+        imgs = []
 
         try:
             resp = await self.session.get(url)  # await response
         except Exception as exc:
             # on any exception mark url as BAD
             print('...', url, 'has error', repr(str(exc)))
-            self.done[url] = [False, lastmod, cf, pr]
+            self.done[url] = [False, lastmod, cf, pr, imgs]
         else:
             # only url with status == 200 and content type == 'text/html' parsed
             if (resp.status == 200 and
@@ -148,7 +182,7 @@
                 data = (await resp.read()).decode('utf-8', 'replace')
                 urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                 lastmod = resp.headers.get('last-modified')
-
+                imgs = await self.addimages(data)
                 asyncio.Task(self.addurls([(u, url) for u in urls]))
                 try:
                     pr = await self.urldict(url, self.changefreq)
@@ -160,7 +194,7 @@
 
         # even if we have no exception, we can mark url as good
         resp.close()
-        self.done[url] = [True, lastmod, cf, pr]
+        self.done[url] = [True, lastmod, cf, pr, imgs]
         self.busy.remove(url)
 
         logging.info(len(self.done), 'completed tasks,', len(self.tasks),
diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py
index 300394c..9eea948 100644
--- a/pysitemap/format_processors/xml.py
+++ b/pysitemap/format_processors/xml.py
@@ -22,6 +22,8 @@ class XMLWriter():
                     timestamp = data[1][1]
                     changefreq = data[1][2]
                     priority = data[1][3]
+                    images = data[1][4]
+
                     url = "<url><loc>{}</loc>".format(data[0])
 
                     if timestamp is not None:
@@ -34,6 +36,10 @@
                     if priority is not None:
                         url += "<priority>{}</priority>".format(str(priority))
 
+                    if len(images) > 0:
+                        for image in images:
+                            url += "<image:image><image:loc>{}</image:loc></image:image>".format(str(image))
+
                     await writer('{}\n'.format(url))
                     await aiodf.fsync()
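
Applied on top of the README example above, a minimal invocation that exercises the new exclude_imgs option could look like the sketch below. The root URL, output file name, and the two patterns are illustrative placeholders, not part of the patch; exclude_imgs entries are regular expressions matched against each collected img src (mirroring how exclude_urls is handled), and matching images are dropped before the sitemap is written.

    import logging
    import sys
    from pysitemap import crawler

    if __name__ == '__main__':
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)

        crawler(
            'https://www.example.com',      # placeholder root URL
            out_file='sitemap.xml',         # placeholder output path
            out_format='xml',
            exclude_imgs=[
                r'logo\.(png|jpg)',         # regex patterns, as in the README hunk
                r'avatars',
            ],
            headers={'User-Agent': 'Crawler'},
            # TZ offset in hours, used for lastmod timestamps
            timezone_offset=3,
        )

Image sources pointing at other hosts are skipped by addimages(), while relative src values are resolved against the root URL before the image entries are written to the sitemap.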