diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index 2b669c8..cd6b59f 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -141,7 +141,7 @@ class Crawler:
else:
mime = resp.headers.get('content-type')
if (resp.status == 200 and
- bool(re.search(re.compile(r"{}".format(expected)), mime))):
+ bool(re.search(re.compile(r'{}'.format(expected)), mime))):
resp.close()
self.busy.remove(url)
return True
@@ -149,67 +149,107 @@ class Crawler:
self.busy.remove(url)
return False
- async def addimages(self, data, url):
+ async def fetchtags(self, data, url, tag_input, fields=[]):
"""
- Find all images in website data
+        Find all target tags in website data and collect the requested fields
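+
+        Each returned tag is a list of single-key dicts, one per requested
+        field found in the tag, e.g. <img src="a.png" title="Foo"> fetched
+        with fields=['src', 'title'] yields [{'src': 'a.png'}, {'title': 'Foo'}]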
"""
- imgs = []
- imgs_ok = []
- lines_tmp = []
- tag = False
+ tags = []
+ lines_join = []
for line in data.split('\n'):
-            if re.search(r'<img', line):
- tag = False
- lines_tmp.append(line)
- continue
- if re.search(r'\/>', line) and tag:
- tag = False
- if tag:
- lines_tmp.append(line)
-
- imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
-
- for img in imgs:
- image_url = ""
- if not await self.contains(img, self.exclude_imgs, rlist=True):
-
- if img.startswith(self.rooturl):
- image_url = img
-
- elif not img.startswith("http"):
- for image_root_url in self.image_root_urls:
- if url.startswith(image_root_url):
- image_url = image_root_url + img
- break
-
- if (image_url != "" and
- image_url not in self.done_images and
- image_url not in self.busy and
- image_url not in self.todo_queue):
- self.todo_queue.add(image_url)
- # Acquire semaphore
- await self.sem.acquire()
- # Create async task
- task = asyncio.ensure_future(self.mimechecker(image_url, '^image\/'))
- # Add collback into task to release semaphore
- task.add_done_callback(lambda t: self.sem.release())
- # Callback to remove task from tasks
- task.add_done_callback(self.tasks.remove)
- # Add task into tasks
- self.tasks.add(task)
- try:
- result = await asyncio.wait_for(task, timeout=20)
- if (result):
- imgs_ok.append(image_url)
- except asyncio.TimeoutError:
- print("couldn't add image:", image_url)
- task.cancel()
- pass
-
- self.done_images.extend(imgs_ok)
- return imgs_ok
+ lines_join.append(line)
+
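+        # Match every '<tag ...>' opening sequence lazily up to its first '>';
+        # joining the collected lines with spaces lets a tag that spans
+        # multiple source lines match as well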
+ tags_raw = re.findall(re.compile(r'<{}.*?>'.format(tag_input)), ' '.join(lines_join))
+
+ for tag_raw in tags_raw:
+ tag_raw = re.sub(re.compile(r'<{}(.*?)>'.format(tag_input)), '\\1', tag_raw)
+
+            # Regex lookaheads
+            # Split the attribute string into 'key=value' chunks: each chunk
+            # starts at a 'key=' token and ends just before the next ' key='
+            # token, or at the end of the tag for the final attribute.
+
+            # TODO Note: this method is error-prone, since it assumes that
+            # no attribute value inside the tag contains a '=' character.
+            # If one does, the regex findall & splitting (below) fail.
+            args_raw = re.findall(r'(?i)(?=[\w]+[=]|[\w\"\'])(.*?)(?=\s[\w]+[=]|\s*\/?$)', tag_raw)
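+            # e.g. ' src="a.png" title="Foo Bar" /' splits into
+            # ['src="a.png"', 'title="Foo Bar"']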
+ tag = []
+ for arg_raw in args_raw:
+ arg = arg_raw.split('=')
+ if len(arg) != 2:
+                    print("warning: could not parse tag attribute, skipping:", arg_raw)
+ continue
+
+ arg_dict = {}
+ key = arg[0]
+ # Remove leading and trailing quote marks from value
+ value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
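+                # e.g. '"a.png"' -> 'a.png' and "'Foo'" -> 'Foo'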
+
+ for field in fields:
+ if key == field:
+ arg_dict[field] = value
+# else:
+# print("warning: ignoring tag data value:", key)
+
+ if len(arg_dict) == 1:
+ tag.append(arg_dict)
+ tags.append(tag)
+ return tags
+
+ async def addtagdata(self, tagdata, url, source_url_field,
+ mimetype, tag_root_urls=[], excludes=[],
+ done_list=[], this_domain=True):
+ """
+        Validate the source URL of each tag in the given tag data
+        :return: list of validated tags (of a single type)
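+
+        Note: done_list is mutated in place; every accepted tag is appended
+        to it, so repeated calls skip tag data that was already processed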
+ """
+ tags = []
+ for data in tagdata:
+ for tag in data:
+                if source_url_field not in tag:
+ continue
+ tag_full_url = ""
+ if not await self.contains(tag[source_url_field], excludes, rlist=True):
+
+ if this_domain:
+ if tag[source_url_field].startswith(self.rooturl):
+ tag_full_url = tag[source_url_field]
+
+ elif not tag[source_url_field].startswith('http'):
+ for tag_root_url in tag_root_urls:
+ if url.startswith(tag_root_url):
+ tag_full_url = tag_root_url + tag[source_url_field]
+ break
+ else:
+ if tag[source_url_field].startswith('http'):
+ tag_full_url = tag[source_url_field]
+
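+                # Queue the URL only if this tag's data has not been processed
+                # yet and the URL is neither in flight nor already queued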
+ if (tag_full_url != "" and
+ data not in done_list and
+ tag_full_url not in self.busy and
+ tag_full_url not in self.todo_queue):
+ self.todo_queue.add(tag_full_url)
+ # Acquire semaphore
+ await self.sem.acquire()
+ # Create async task
+ task = asyncio.ensure_future(self.mimechecker(tag_full_url, mimetype))
+                    # Add callback to task to release semaphore
+ task.add_done_callback(lambda t: self.sem.release())
+ # Callback to remove task from tasks
+ task.add_done_callback(self.tasks.remove)
+ # Add task into tasks
+ self.tasks.add(task)
+ try:
+ result = await asyncio.wait_for(task, timeout=20)
+ if (result):
+ tags.append(data)
+
+ except asyncio.TimeoutError:
+ print("couldn't add tag data:", tag_full_url)
+                        task.cancel()
+
+ done_list.extend(tags)
+ return tags
async def process(self, url):
"""
@@ -240,8 +280,23 @@ class Crawler:
('text/html' in resp.headers.get('content-type'))):
data = (await resp.read()).decode('utf-8', 'replace')
urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
+
lastmod = resp.headers.get('last-modified')
- imgs = await self.addimages(data, url)
+
+ # Ref: https://support.google.com/webmasters/answer/178636?hl=en
+ img_data = await self.fetchtags(
+ data, url, 'img',
+ fields=['src', 'title', 'caption', 'geo_location', 'license']
+ )
+ imgs = await self.addtagdata(
+ tagdata=img_data, url=url,
+                    source_url_field='src', mimetype=r'^image/',
+ tag_root_urls=self.image_root_urls,
+ excludes=self.exclude_imgs,
+ done_list=self.done_images,
+ this_domain=True
+ )
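+                # 'imgs' now holds only the tags whose resolved URL passed the
+                # image MIME check in addtagdata; the XML format processor
+                # writes them out as <image:image> entries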
+
asyncio.Task(self.addurls([(u, url) for u in urls]))
try: pr = await self.urldict(url, self.changefreq)
diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py
index fe2ea05..1b274a2 100644
--- a/pysitemap/format_processors/xml.py
+++ b/pysitemap/format_processors/xml.py
@@ -23,7 +23,7 @@ class XMLWriter():
timestamp = data[1][1]
changefreq = data[1][2]
priority = data[1][3]
- images = data[1][4]
+ image_data = data[1][4]
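+            # image_data comes from Crawler.fetchtags(): a list of tags, each
+            # tag being a list of single-key attribute dicts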
url = "{}".format(data[0])
@@ -37,9 +37,17 @@ class XMLWriter():
if priority is not None:
url += "{}".format(str(priority))
-            if len(images) > 0:
-                for image in images:
-                    url += "<image:image><image:loc>{}</image:loc></image:image>".format(str(image))
+            if len(image_data) > 0:
+                for image in image_data:
+                    image_xml = ""
+                    for arg in image:
+                        if 'src' in arg: image_xml += "<image:loc>{}</image:loc>".format(arg['src'])
+                        if 'title' in arg: image_xml += "<image:title>{}</image:title>".format(arg['title'])
+                        if 'caption' in arg: image_xml += "<image:caption>{}</image:caption>".format(arg['caption'])
+                        if 'geo_location' in arg: image_xml += "<image:geo_location>{}</image:geo_location>".format(arg['geo_location'])
+                        if 'license' in arg: image_xml += "<image:license>{}</image:license>".format(arg['license'])
+
+                    url += "<image:image>{}</image:image>".format(image_xml)
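+            # A finished entry thus looks like:
+            # <url><loc>...</loc>...<image:image><image:loc>...</image:loc>
+            # <image:title>...</image:title></image:image></url>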
             await writer('<url>{}</url>\n'.format(url))