From 1d0b3865350e9b36866261dc20114c179f147896 Mon Sep 17 00:00:00 2001
From: Pekka Helenius
Date: Mon, 4 May 2020 15:45:36 +0300
Subject: [PATCH] Improve image processing; Support more image data;
 Generalize tag data fetching operation

---
 pysitemap/base_crawler.py          | 175 +++++++++++++++++++----------
 pysitemap/format_processors/xml.py |  16 ++-
 2 files changed, 127 insertions(+), 64 deletions(-)

diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index 2b669c8..cd6b59f 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -141,7 +141,7 @@ class Crawler:
         else:
             mime = resp.headers.get('content-type')
             if (resp.status == 200 and
-                bool(re.search(re.compile(r"{}".format(expected)), mime))):
+                bool(re.search(re.compile(r'{}'.format(expected)), mime))):
                 resp.close()
                 self.busy.remove(url)
                 return True
@@ -149,67 +149,107 @@ class Crawler:
         self.busy.remove(url)
         return False
 
-    async def addimages(self, data, url):
+    async def fetchtags(self, data, url, tag_input, fields=[]):
         """
-        Find all images in website data
+        Find and collect all target tags from website data
         """
-        imgs = []
-        imgs_ok = []
-        lines_tmp = []
-        tag = False
+        tags = []
+        lines_join = []
         for line in data.split('\n'):
-            if re.search(r'<img', line):
-                tag = True
-                lines_tmp.append(line)
-                continue
-            if re.search(r'\/>', line) and tag:
-                tag = False
-            if tag:
-                lines_tmp.append(line)
-
-        imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
-
-        for img in imgs:
-            image_url = ""
-            if not await self.contains(img, self.exclude_imgs, rlist=True):
-
-                if img.startswith(self.rooturl):
-                    image_url = img
-
-                elif not img.startswith("http"):
-                    for image_root_url in self.image_root_urls:
-                        if url.startswith(image_root_url):
-                            image_url = image_root_url + img
-                            break
-
-                if (image_url != "" and
-                    image_url not in self.done_images and
-                    image_url not in self.busy and
-                    image_url not in self.todo_queue):
-                    self.todo_queue.add(image_url)
-                    # Acquire semaphore
-                    await self.sem.acquire()
-                    # Create async task
-                    task = asyncio.ensure_future(self.mimechecker(image_url, '^image\/'))
-                    # Add collback into task to release semaphore
-                    task.add_done_callback(lambda t: self.sem.release())
-                    # Callback to remove task from tasks
-                    task.add_done_callback(self.tasks.remove)
-                    # Add task into tasks
-                    self.tasks.add(task)
-                    try:
-                        result = await asyncio.wait_for(task, timeout=20)
-                        if (result):
-                            imgs_ok.append(image_url)
-                    except asyncio.TimeoutError:
-                        print("couldn't add image:", image_url)
-                        task.cancel()
-                        pass
-
-        self.done_images.extend(imgs_ok)
-        return imgs_ok
+            lines_join.append(line)
+
+        tags_raw = re.findall(re.compile(r'<{}.*?>'.format(tag_input)), ' '.join(lines_join))
+
+        for tag_raw in tags_raw:
+            tag_raw = re.sub(re.compile(r'<{}(.*?)>'.format(tag_input)), '\\1', tag_raw)
+
+            # Regex lookahead + lookbehind:
+            # find attribute patterns which start at a "word=" boundary
+            # and end just before the next " word=" boundary, keeping
+            # the first pattern so that its value can be extracted.
+
+            # TODO Note: this method is error-prone, since it assumes that
+            # no attribute value inside a tag contains a literal "=".
+            # If one does, the args regex findall & splitting (below) fails.
+            args_raw = re.findall(r'(?i)(?=[\w]+[=]|[\w\"\'])(.*?)(?=\s[\w]+[=])', tag_raw)
+            tag = []
+            for arg_raw in args_raw:
+                arg = arg_raw.split('=')
+                if len(arg) != 2:
+                    print("warning: failure on tag data parsing operation.")
+                    continue
+
+                arg_dict = {}
+                key = arg[0]
+                # Remove leading and trailing quote marks from value
+                value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
+
+                for field in fields:
+                    if key == field:
+                        arg_dict[field] = value
+#                    else:
+#                        print("warning: ignoring tag data value:", key)
+
+                if len(arg_dict) == 1:
+                    tag.append(arg_dict)
+            tags.append(tag)
+        return tags
+
+    async def addtagdata(self, tagdata, url, source_url_field,
+                         mimetype, tag_root_urls=[], excludes=[],
+                         done_list=[], this_domain=True):
+        """
+        Validate existence of url in given tagdata
+        :return: list of validated tags (of a single type)
+        """
+        tags = []
+        for data in tagdata:
+            for tag in data:
+                if source_url_field not in tag:
+                    continue
+                tag_full_url = ""
+                if not await self.contains(tag[source_url_field], excludes, rlist=True):
+
+                    if this_domain:
+                        if tag[source_url_field].startswith(self.rooturl):
+                            tag_full_url = tag[source_url_field]
+
+                        elif not tag[source_url_field].startswith('http'):
+                            for tag_root_url in tag_root_urls:
+                                if url.startswith(tag_root_url):
+                                    tag_full_url = tag_root_url + tag[source_url_field]
+                                    break
+                    else:
+                        if tag[source_url_field].startswith('http'):
+                            tag_full_url = tag[source_url_field]
+
+                    if (tag_full_url != "" and
+                        data not in done_list and
+                        tag_full_url not in self.busy and
+                        tag_full_url not in self.todo_queue):
+                        self.todo_queue.add(tag_full_url)
+                        # Acquire semaphore
+                        await self.sem.acquire()
+                        # Create async task
+                        task = asyncio.ensure_future(self.mimechecker(tag_full_url, mimetype))
+                        # Add callback to task to release semaphore
+                        task.add_done_callback(lambda t: self.sem.release())
+                        # Callback to remove task from tasks
+                        task.add_done_callback(self.tasks.remove)
+                        # Add task into tasks
+                        self.tasks.add(task)
+                        try:
+                            result = await asyncio.wait_for(task, timeout=20)
+                            if (result):
+                                tags.append(data)
+
+                        except asyncio.TimeoutError:
+                            print("couldn't add tag data:", tag_full_url)
+                            task.cancel()
+                            pass
+
+        done_list.extend(tags)
+        return tags
 
     async def process(self, url):
         """
@@ -240,8 +280,23 @@ class Crawler:
                     ('text/html' in resp.headers.get('content-type'))):
                 data = (await resp.read()).decode('utf-8', 'replace')
                 urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
+
                 lastmod = resp.headers.get('last-modified')
-                imgs = await self.addimages(data, url)
+
+                # Ref: https://support.google.com/webmasters/answer/178636?hl=en
+                img_data = await self.fetchtags(
+                    data, url, 'img',
+                    fields=['src', 'title', 'caption', 'geo_location', 'license']
+                )
+                imgs = await self.addtagdata(
+                    tagdata=img_data, url=url,
+                    source_url_field='src', mimetype='^image\/',
+                    tag_root_urls=self.image_root_urls,
+                    excludes=self.exclude_imgs,
+                    done_list=self.done_images,
+                    this_domain=True
+                )
+
                 asyncio.Task(self.addurls([(u, url) for u in urls]))
                 try:
                     pr = await self.urldict(url, self.changefreq)
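
For clarity, below is a minimal standalone sketch of the attribute-parsing
pipeline that fetchtags() implements above. The sample markup, the tag_input
value, and the field list are invented for illustration; only the regular
expressions come from the patch, and the per-attribute dict handling is
condensed into a single membership test:

    import re

    # Hypothetical input: one line of HTML containing an <img> tag.
    html = '<p>intro</p> <img src="/media/pic.png" title="A pic" alt="x" id="i1">'
    tag_input = 'img'
    fields = ['src', 'title']

    tags = []
    for tag_raw in re.findall(r'<{}.*?>'.format(tag_input), html):
        # Keep only the attribute string between "<img" and ">"
        tag_raw = re.sub(r'<{}(.*?)>'.format(tag_input), '\\1', tag_raw)
        tag = []
        # Lookahead split: each capture runs from one 'word=' boundary
        # up to (but not including) the next ' word=' boundary.
        for arg_raw in re.findall(r'(?i)(?=[\w]+[=]|[\w\"\'])(.*?)(?=\s[\w]+[=])', tag_raw):
            arg = arg_raw.split('=')
            if len(arg) != 2:
                # The failure mode noted in the TODO above: a literal '='
                # inside an attribute value breaks the split.
                continue
            key = arg[0]
            value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
            if key in fields:
                tag.append({key: value})
        tags.append(tag)

    print(tags)  # [[{'src': '/media/pic.png'}, {'title': 'A pic'}]]

Note that 'alt' is parsed but dropped by the field filter, and that the last
attribute of a tag ('id' here) is never captured at all, because the closing
lookahead requires another ' word=' to follow.
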
diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py
index fe2ea05..1b274a2 100644
--- a/pysitemap/format_processors/xml.py
+++ b/pysitemap/format_processors/xml.py
@@ -23,7 +23,7 @@ class XMLWriter():
                     timestamp = data[1][1]
                     changefreq = data[1][2]
                     priority = data[1][3]
-                    images = data[1][4]
+                    image_data = data[1][4]
 
                     url = "<url><loc>{}</loc>".format(data[0])
 
@@ -37,9 +37,17 @@ class XMLWriter():
                     if priority is not None:
                         url += "<priority>{}</priority>".format(str(priority))
 
-                    if len(images) > 0:
-                        for image in images:
-                            url += "<image:image><image:loc>{}</image:loc></image:image>".format(str(image))
+                    if len(image_data) > 0:
+                        for image in image_data:
+                            image_xml = ""
+                            for arg in image:
+                                if 'src' in arg: image_xml += "<image:loc>{}</image:loc>".format(arg['src'])
+                                if 'title' in arg: image_xml += "<image:title>{}</image:title>".format(arg['title'])
+                                if 'caption' in arg: image_xml += "<image:caption>{}</image:caption>".format(arg['caption'])
+                                if 'geo_location' in arg: image_xml += "<image:geo_location>{}</image:geo_location>".format(arg['geo_location'])
+                                if 'license' in arg: image_xml += "<image:license>{}</image:license>".format(arg['license'])
+
+                            url += "<image:image>{}</image:image>".format(image_xml)
 
                     await writer('{}</url>\n'.format(url))
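
A rough sketch of the <url> entry the rewritten XMLWriter loop should now
emit for one page with one image, following the image sitemap extension
referenced in the patch (https://support.google.com/webmasters/answer/178636?hl=en).
The page URL, image URL, and title are invented sample values:

    # Mirrors the new xml.py loop for a single page; values are invented.
    image_data = [[{'src': 'https://example.com/media/pic.png'},
                   {'title': 'A pic'}]]

    url = "<url><loc>{}</loc>".format("https://example.com/page.html")
    for image in image_data:
        image_xml = ""
        for arg in image:
            if 'src' in arg:   image_xml += "<image:loc>{}</image:loc>".format(arg['src'])
            if 'title' in arg: image_xml += "<image:title>{}</image:title>".format(arg['title'])
        url += "<image:image>{}</image:image>".format(image_xml)

    print('{}</url>'.format(url))
    # Output (single line, wrapped here for readability):
    #   <url><loc>https://example.com/page.html</loc>
    #   <image:image><image:loc>https://example.com/media/pic.png</image:loc>
    #   <image:title>A pic</image:title></image:image></url>

For such entries to validate, the enclosing <urlset> element must also declare
the image namespace (xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"),
which this patch does not touch.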