From 1d0b3865350e9b36866261dc20114c179f147896 Mon Sep 17 00:00:00 2001
From: Pekka Helenius
Date: Mon, 4 May 2020 15:45:36 +0300
Subject: [PATCH] Improve image processing; Support more image data;
 Generalize tag data fetching operation

---
 pysitemap/base_crawler.py          | 175 +++++++++++++++++++----------
 pysitemap/format_processors/xml.py |  16 ++-
 2 files changed, 127 insertions(+), 64 deletions(-)

diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index 2b669c8..cd6b59f 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -141,7 +141,7 @@ class Crawler:
         else:
             mime = resp.headers.get('content-type')
             if (resp.status == 200 and
-                bool(re.search(re.compile(r"{}".format(expected)), mime))):
+                bool(re.search(re.compile(r'{}'.format(expected)), mime))):
                 resp.close()
                 self.busy.remove(url)
                 return True
@@ -149,67 +149,107 @@ class Crawler:
         self.busy.remove(url)
         return False
 
-    async def addimages(self, data, url):
+    async def fetchtags(self, data, url, tag_input, fields=[]):
         """
-        Find all images in website data
+        Find and collect all target tags from website data
         """
-        imgs = []
-        imgs_ok = []
-        lines_tmp = []
-        tag = False
+        tags = []
+        lines_join = []
         for line in data.split('\n'):
-            if re.search(r'<img', line):
-                tag = True
-                lines_tmp.append(line)
-                continue
-            if re.search(r'\/>', line) and tag:
-                tag = False
-            if tag:
-                lines_tmp.append(line)
-
-        imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
-
-        for img in imgs:
-            image_url = ""
-            if not await self.contains(img, self.exclude_imgs, rlist=True):
-
-                if img.startswith(self.rooturl):
-                    image_url = img
-
-                elif not img.startswith("http"):
-                    for image_root_url in self.image_root_urls:
-                        if url.startswith(image_root_url):
-                            image_url = image_root_url + img
-                            break
-
-                if (image_url != "" and
-                    image_url not in self.done_images and
-                    image_url not in self.busy and
-                    image_url not in self.todo_queue):
-                    self.todo_queue.add(image_url)
-                    # Acquire semaphore
-                    await self.sem.acquire()
-                    # Create async task
-                    task = asyncio.ensure_future(self.mimechecker(image_url, '^image\/'))
-                    # Add collback into task to release semaphore
-                    task.add_done_callback(lambda t: self.sem.release())
-                    # Callback to remove task from tasks
-                    task.add_done_callback(self.tasks.remove)
-                    # Add task into tasks
-                    self.tasks.add(task)
-                    try:
-                        result = await asyncio.wait_for(task, timeout=20)
-                        if (result):
-                            imgs_ok.append(image_url)
-                    except asyncio.TimeoutError:
-                        print("couldn't add image:", image_url)
-                        task.cancel()
-                        pass
-
-        self.done_images.extend(imgs_ok)
-        return imgs_ok
+            lines_join.append(line)
+
+        tags_raw = re.findall(re.compile(r'<{}.*?>'.format(tag_input)), ' '.join(lines_join))
+
+        for tag_raw in tags_raw:
+            tag_raw = re.sub(re.compile(r'<{}(.*?)>'.format(tag_input)), '\\1', tag_raw)
+
+            # Regex lookahead + lookbehind:
+            # find attribute patterns which start at a "word=" boundary
+            # and end just before the next " word=" boundary, keeping
+            # the first pattern so that its value can be extracted.
+
+            # TODO Note: this method is error-prone, since it assumes that
+            # no attribute value inside a tag contains a literal "=".
+            # If one does, the args regex findall & splitting (below) fails.
+            args_raw = re.findall(r'(?i)(?=[\w]+[=]|[\w\"\'])(.*?)(?=\s[\w]+[=])', tag_raw)
+            tag = []
+            for arg_raw in args_raw:
+                arg = arg_raw.split('=')
+                if len(arg) != 2:
+                    print("warning: failure on tag data parsing operation.")
+                    continue
+
+                arg_dict = {}
+                key = arg[0]
+                # Remove leading and trailing quote marks from value
+                value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
+
+                for field in fields:
+                    if key == field:
+                        arg_dict[field] = value
+#                    else:
+#                        print("warning: ignoring tag data value:", key)
+
+                if len(arg_dict) == 1:
+                    tag.append(arg_dict)
+            tags.append(tag)
+        return tags
+
+    async def addtagdata(self, tagdata, url, source_url_field,
+                         mimetype, tag_root_urls=[], excludes=[],
+                         done_list=[], this_domain=True):
+        """
+        Validate existence of url in given tagdata
+        :return: list of validated tags (of a single type)
+        """
+        tags = []
+        for data in tagdata:
+            for tag in data:
+                if source_url_field not in tag:
+                    continue
+                tag_full_url = ""
+                if not await self.contains(tag[source_url_field], excludes, rlist=True):
+
+                    if this_domain:
+                        if tag[source_url_field].startswith(self.rooturl):
+                            tag_full_url = tag[source_url_field]
+
+                        elif not tag[source_url_field].startswith('http'):
+                            for tag_root_url in tag_root_urls:
+                                if url.startswith(tag_root_url):
+                                    tag_full_url = tag_root_url + tag[source_url_field]
+                                    break
+                    else:
+                        if tag[source_url_field].startswith('http'):
+                            tag_full_url = tag[source_url_field]
+
+                    if (tag_full_url != "" and
+                        data not in done_list and
+                        tag_full_url not in self.busy and
+                        tag_full_url not in self.todo_queue):
+                        self.todo_queue.add(tag_full_url)
+                        # Acquire semaphore
+                        await self.sem.acquire()
+                        # Create async task
+                        task = asyncio.ensure_future(self.mimechecker(tag_full_url, mimetype))
+                        # Add callback to task to release semaphore
+                        task.add_done_callback(lambda t: self.sem.release())
+                        # Callback to remove task from tasks
+                        task.add_done_callback(self.tasks.remove)
+                        # Add task into tasks
+                        self.tasks.add(task)
+                        try:
+                            result = await asyncio.wait_for(task, timeout=20)
+                            if (result):
+                                tags.append(data)
+
+                        except asyncio.TimeoutError:
+                            print("couldn't add tag data:", tag_full_url)
+                            task.cancel()
+                            pass
+
+        done_list.extend(tags)
+        return tags
 
     async def process(self, url):
         """
@@ -240,8 +280,23 @@ class Crawler:
                     ('text/html' in resp.headers.get('content-type'))):
                 data = (await resp.read()).decode('utf-8', 'replace')
                 urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
+
                 lastmod = resp.headers.get('last-modified')
-                imgs = await self.addimages(data, url)
+
+                # Ref: https://support.google.com/webmasters/answer/178636?hl=en
+                img_data = await self.fetchtags(
+                    data, url, 'img',
+                    fields=['src', 'title', 'caption', 'geo_location', 'license']
+                )
+                imgs = await self.addtagdata(
+                    tagdata=img_data, url=url,
+                    source_url_field='src', mimetype='^image\/',
+                    tag_root_urls=self.image_root_urls,
+                    excludes=self.exclude_imgs,
+                    done_list=self.done_images,
+                    this_domain=True
+                )
+
                 asyncio.Task(self.addurls([(u, url) for u in urls]))
                 try:
                     pr = await self.urldict(url, self.changefreq)
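
For clarity, below is a minimal standalone sketch of the attribute-parsing
pipeline that fetchtags() implements above. The sample markup, the tag_input
value, and the field list are invented for illustration; only the regular
expressions come from the patch, and the per-attribute dict handling is
condensed into a single membership test:

    import re

    # Hypothetical input: one line of HTML containing an <img> tag.
    html = '<p>intro</p> <img src="/media/pic.png" title="A pic" alt="x" id="i1">'
    tag_input = 'img'
    fields = ['src', 'title']

    tags = []
    for tag_raw in re.findall(r'<{}.*?>'.format(tag_input), html):
        # Keep only the attribute string between "<img" and ">"
        tag_raw = re.sub(r'<{}(.*?)>'.format(tag_input), '\\1', tag_raw)
        tag = []
        # Lookahead split: each capture runs from one 'word=' boundary
        # up to (but not including) the next ' word=' boundary.
        for arg_raw in re.findall(r'(?i)(?=[\w]+[=]|[\w\"\'])(.*?)(?=\s[\w]+[=])', tag_raw):
            arg = arg_raw.split('=')
            if len(arg) != 2:
                # The failure mode noted in the TODO above: a literal '='
                # inside an attribute value breaks the split.
                continue
            key = arg[0]
            value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
            if key in fields:
                tag.append({key: value})
        tags.append(tag)

    print(tags)  # [[{'src': '/media/pic.png'}, {'title': 'A pic'}]]

Note that 'alt' is parsed but dropped by the field filter, and that the last
attribute of a tag ('id' here) is never captured at all, because the closing
lookahead requires another ' word=' to follow.
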
diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py
index fe2ea05..1b274a2 100644
--- a/pysitemap/format_processors/xml.py
+++ b/pysitemap/format_processors/xml.py
@@ -23,7 +23,7 @@ class XMLWriter():
                     timestamp = data[1][1]
                     changefreq = data[1][2]
                     priority = data[1][3]
-                    images = data[1][4]
+                    image_data = data[1][4]
 
                     url = "<url><loc>{}</loc>".format(data[0])
 
@@ -37,9 +37,17 @@ class XMLWriter():
                     if priority is not None:
                         url += "<priority>{}</priority>".format(str(priority))
 
-                    if len(images) > 0:
-                        for image in images:
-                            url += "<image:image><image:loc>{}</image:loc></image:image>".format(str(image))
+                    if len(image_data) > 0:
+                        for image in image_data:
+                            image_xml = ""
+                            for arg in image:
+                                if 'src' in arg: image_xml += "<image:loc>{}</image:loc>".format(arg['src'])
+                                if 'title' in arg: image_xml += "<image:title>{}</image:title>".format(arg['title'])
+                                if 'caption' in arg: image_xml += "<image:caption>{}</image:caption>".format(arg['caption'])
+                                if 'geo_location' in arg: image_xml += "<image:geo_location>{}</image:geo_location>".format(arg['geo_location'])
+                                if 'license' in arg: image_xml += "<image:license>{}</image:license>".format(arg['license'])
+
+                            url += "<image:image>{}</image:image>".format(image_xml)
 
                     await writer('{}</url>\n'.format(url))
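
A rough sketch of the <url> entry the rewritten XMLWriter loop should now
emit for one page with one image, following the image sitemap extension
referenced in the patch (https://support.google.com/webmasters/answer/178636?hl=en).
The page URL, image URL, and title are invented sample values:

    # Mirrors the new xml.py loop for a single page; values are invented.
    image_data = [[{'src': 'https://example.com/media/pic.png'},
                   {'title': 'A pic'}]]

    url = "<url><loc>{}</loc>".format("https://example.com/page.html")
    for image in image_data:
        image_xml = ""
        for arg in image:
            if 'src' in arg:   image_xml += "<image:loc>{}</image:loc>".format(arg['src'])
            if 'title' in arg: image_xml += "<image:title>{}</image:title>".format(arg['title'])
        url += "<image:image>{}</image:image>".format(image_xml)

    print('{}</url>'.format(url))
    # Output (single line, wrapped here for readability):
    #   <url><loc>https://example.com/page.html</loc>
    #   <image:image><image:loc>https://example.com/media/pic.png</image:loc>
    #   <image:title>A pic</image:title></image:image></url>

For such entries to validate, the enclosing <urlset> element must also declare
the image namespace (xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"),
which this patch does not touch.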