From f4512b36da1fc9f461745c1f43e2dba10161ad8d Mon Sep 17 00:00:00 2001 From: Pekka Helenius Date: Mon, 4 May 2020 22:33:07 +0300 Subject: [PATCH] Remove unnecessary list structure --- pysitemap/base_crawler.py | 86 +++++++++++++++--------------- pysitemap/format_processors/xml.py | 17 +++--- 2 files changed, 50 insertions(+), 53 deletions(-) diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py index def9fbc..ceabea7 100644 --- a/pysitemap/base_crawler.py +++ b/pysitemap/base_crawler.py @@ -197,8 +197,7 @@ class Crawler: # print("warning: ignoring tag data value:", key) if len(arg_dict) == 1: - tag.append(arg_dict) - tags.append(tag) + tags.append(arg_dict) return tags async def addtagdata(self, tagdata, url, source_url_field, @@ -209,48 +208,47 @@ class Crawler: :return: dictionary of validated tags (of single type) """ tags = [] - for data in tagdata: - for tag in data: - if not source_url_field in tag: - continue - if not await self.contains(tag[source_url_field], excludes, rlist=True): - - if this_domain: - if not tag[source_url_field].startswith('http'): - for tag_root_url in tag_root_urls: - if url.startswith(tag_root_url): - tag[source_url_field] = tag_root_url + tag[source_url_field] - break - else: - if not tag[source_url_field].startswith('http'): - continue - - if (tag[source_url_field].startswith('http') and - data not in done_list and - tag[source_url_field] not in self.busy and - tag[source_url_field] not in self.todo_queue): - self.todo_queue.add(tag[source_url_field]) - # Acquire semaphore - await self.sem.acquire() - # Create async task - task = asyncio.ensure_future(self.mimechecker(tag[source_url_field], mimetype)) - # Add collback into task to release semaphore - task.add_done_callback(lambda t: self.sem.release()) - # Callback to remove task from tasks - task.add_done_callback(self.tasks.remove) - # Add task into tasks - self.tasks.add(task) - try: - result = await asyncio.wait_for(task, timeout=20) - if (result): - tags.append(data) - - except asyncio.TimeoutError: - print("couldn't add tag data:", tag[source_url_field]) - task.cancel() - pass - - done_list.extend(tags) + for tag in tagdata: + if not source_url_field in tag: + continue + if not await self.contains(tag[source_url_field], excludes, rlist=True): + + if this_domain: + if not tag[source_url_field].startswith('http'): + for tag_root_url in tag_root_urls: + if url.startswith(tag_root_url): + tag[source_url_field] = tag_root_url + tag[source_url_field] + break + else: + if not tag[source_url_field].startswith('http'): + continue + + if (tag[source_url_field].startswith('http') and + tag not in done_list and + tag[source_url_field] not in self.busy and + tag[source_url_field] not in self.todo_queue): + self.todo_queue.add(tag[source_url_field]) + # Acquire semaphore + await self.sem.acquire() + # Create async task + task = asyncio.ensure_future(self.mimechecker(tag[source_url_field], mimetype)) + # Add collback into task to release semaphore + task.add_done_callback(lambda t: self.sem.release()) + # Callback to remove task from tasks + task.add_done_callback(self.tasks.remove) + # Add task into tasks + self.tasks.add(task) + try: + result = await asyncio.wait_for(task, timeout=20) + if (result): + tags.append(tag) + + except asyncio.TimeoutError: + print("couldn't add tag data:", tag[source_url_field]) + task.cancel() + pass + + done_list.extend(tags) return tags async def process(self, url): diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py index 1b274a2..b8fb3ba 100644 --- a/pysitemap/format_processors/xml.py +++ b/pysitemap/format_processors/xml.py @@ -39,15 +39,14 @@ class XMLWriter(): if len(image_data) > 0: for image in image_data: - for arg in image: - image_xml = "" - if 'src' in arg: image_xml += "{}".format(arg['src']) - if 'title' in arg: image_xml += "{}".format(arg['title']) - if 'caption' in arg: image_xml += "{}".format(arg['caption']) - if 'geo_location' in arg: image_xml += "{}".format(arg['geo_location']) - if 'license' in arg: image_xml += "{}".format(arg['license']) - - url += "{}".format(image_xml) + image_xml = "" + if 'src' in image: image_xml += "{}".format(image['src']) + if 'title' in image: image_xml += "{}".format(image['title']) + if 'caption' in image: image_xml += "{}".format(image['caption']) + if 'geo_location' in image: image_xml += "{}".format(image['geo_location']) + if 'license' in image: image_xml += "{}".format(image['license']) + + url += "{}".format(image_xml) await writer('{}\n'.format(url))