diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index def9fbc..ceabea7 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -197,8 +197,7 @@ class Crawler:
# print("warning: ignoring tag data value:", key)
if len(arg_dict) == 1:
- tag.append(arg_dict)
- tags.append(tag)
+ tags.append(arg_dict)
return tags
async def addtagdata(self, tagdata, url, source_url_field,
@@ -209,48 +208,47 @@ class Crawler:
:return: dictionary of validated tags (of single type)
"""
tags = []
- for data in tagdata:
- for tag in data:
- if not source_url_field in tag:
- continue
- if not await self.contains(tag[source_url_field], excludes, rlist=True):
-
- if this_domain:
- if not tag[source_url_field].startswith('http'):
- for tag_root_url in tag_root_urls:
- if url.startswith(tag_root_url):
- tag[source_url_field] = tag_root_url + tag[source_url_field]
- break
- else:
- if not tag[source_url_field].startswith('http'):
- continue
-
- if (tag[source_url_field].startswith('http') and
- data not in done_list and
- tag[source_url_field] not in self.busy and
- tag[source_url_field] not in self.todo_queue):
- self.todo_queue.add(tag[source_url_field])
- # Acquire semaphore
- await self.sem.acquire()
- # Create async task
- task = asyncio.ensure_future(self.mimechecker(tag[source_url_field], mimetype))
- # Add collback into task to release semaphore
- task.add_done_callback(lambda t: self.sem.release())
- # Callback to remove task from tasks
- task.add_done_callback(self.tasks.remove)
- # Add task into tasks
- self.tasks.add(task)
- try:
- result = await asyncio.wait_for(task, timeout=20)
- if (result):
- tags.append(data)
-
- except asyncio.TimeoutError:
- print("couldn't add tag data:", tag[source_url_field])
- task.cancel()
- pass
-
- done_list.extend(tags)
+ for tag in tagdata:
+ if not source_url_field in tag:
+ continue
+ if not await self.contains(tag[source_url_field], excludes, rlist=True):
+
+ if this_domain:
+ if not tag[source_url_field].startswith('http'):
+ for tag_root_url in tag_root_urls:
+ if url.startswith(tag_root_url):
+ tag[source_url_field] = tag_root_url + tag[source_url_field]
+ break
+ else:
+ if not tag[source_url_field].startswith('http'):
+ continue
+
+ if (tag[source_url_field].startswith('http') and
+ tag not in done_list and
+ tag[source_url_field] not in self.busy and
+ tag[source_url_field] not in self.todo_queue):
+ self.todo_queue.add(tag[source_url_field])
+ # Acquire semaphore
+ await self.sem.acquire()
+ # Create async task
+ task = asyncio.ensure_future(self.mimechecker(tag[source_url_field], mimetype))
+ # Add collback into task to release semaphore
+ task.add_done_callback(lambda t: self.sem.release())
+ # Callback to remove task from tasks
+ task.add_done_callback(self.tasks.remove)
+ # Add task into tasks
+ self.tasks.add(task)
+ try:
+ result = await asyncio.wait_for(task, timeout=20)
+ if (result):
+ tags.append(tag)
+
+ except asyncio.TimeoutError:
+ print("couldn't add tag data:", tag[source_url_field])
+ task.cancel()
+ pass
+
+ done_list.extend(tags)
return tags
async def process(self, url):
diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py
index 1b274a2..b8fb3ba 100644
--- a/pysitemap/format_processors/xml.py
+++ b/pysitemap/format_processors/xml.py
@@ -39,15 +39,14 @@ class XMLWriter():
if len(image_data) > 0:
for image in image_data:
- for arg in image:
- image_xml = ""
- if 'src' in arg: image_xml += "{}".format(arg['src'])
- if 'title' in arg: image_xml += "{}".format(arg['title'])
- if 'caption' in arg: image_xml += "{}".format(arg['caption'])
- if 'geo_location' in arg: image_xml += "{}".format(arg['geo_location'])
- if 'license' in arg: image_xml += "{}".format(arg['license'])
-
- url += "{}".format(image_xml)
+ image_xml = ""
+ if 'src' in image: image_xml += "{}".format(image['src'])
+ if 'title' in image: image_xml += "{}".format(image['title'])
+ if 'caption' in image: image_xml += "{}".format(image['caption'])
+ if 'geo_location' in image: image_xml += "{}".format(image['geo_location'])
+ if 'license' in image: image_xml += "{}".format(image['license'])
+
+ url += "{}".format(image_xml)
await writer('{}\n'.format(url))