diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index 37867ac..2b669c8 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -14,8 +14,8 @@ class Crawler:
         'txt': TextWriter
     }
 
-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, exclude_urls=[], exclude_imgs=[],
-                 verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=list, exclude_imgs=list,
+                 image_root_urls=list, verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
                  todo_queue_backend=set, done_backend=dict, done_images=list):
         """
         Crawler constructor
@@ -25,12 +25,14 @@ class Crawler:
         :type out_file: str
         :param out_format: sitemap type [xml | txt]. Default xml
         :type out_format: str
-        :param maxtasks: maximum count of tasks. Default 100
+        :param maxtasks: maximum count of tasks. Default 10
         :type maxtasks: int
         :param exclude_urls: excludable url paths relative to root url
         :type exclude_urls: list
         :param exclude_imgs: excludable img url paths relative to root url
         :type exclude_imgs: list
+        :param image_root_urls: recognized image root urls on the domain
+        :type image_root_urls: list
         :param verifyssl: verify website certificate?
         :type verifyssl: boolean
         :param timezone_offset: timezone offset for lastmod tags
@@ -43,6 +45,7 @@ class Crawler:
         self.rooturl = rooturl
         self.exclude_urls = exclude_urls
         self.exclude_imgs = exclude_imgs
+        self.image_root_urls = image_root_urls
         self.todo_queue = todo_queue_backend()
         self.busy = set()
         self.done = done_backend()
@@ -127,6 +130,10 @@ class Crawler:
         """
         Check url resource mimetype
         """
+
+        self.todo_queue.remove(url)
+        self.busy.add(url)
+
         try:
             resp = await self.session.get(url)
         except Exception as exc:
@@ -135,10 +142,14 @@ class Crawler:
         mime = resp.headers.get('content-type')
         if (resp.status == 200 and
                 bool(re.search(re.compile(r"{}".format(expected)), mime))):
+            resp.close()
+            self.busy.remove(url)
             return True
+        resp.close()
+        self.busy.remove(url)
         return False
 
-    async def addimages(self, data):
+    async def addimages(self, data, url):
         """
         Find all images in website data
         """
@@ -161,15 +172,41 @@ class Crawler:
             imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
 
         for img in imgs:
+            image_url = ""
             if not await self.contains(img, self.exclude_imgs, rlist=True):
+
                 if img.startswith(self.rooturl):
-                    if (await self.mimechecker(img, '^image\/') and
-                            img not in self.done_images):
-                        imgs_ok.append(img)
+                    image_url = img
+
                 elif not img.startswith("http"):
-                    if (await self.mimechecker(img, '^image\/') and
-                            img not in self.done_images):
-                        imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
+                    for image_root_url in self.image_root_urls:
+                        if url.startswith(image_root_url):
+                            image_url = image_root_url + img
+                            break
+
+            if (image_url != "" and
+                image_url not in self.done_images and
+                image_url not in self.busy and
+                image_url not in self.todo_queue):
+                self.todo_queue.add(image_url)
+                # Acquire semaphore
+                await self.sem.acquire()
+                # Create async task
+                task = asyncio.ensure_future(self.mimechecker(image_url, '^image\/'))
+                # Add callback into task to release semaphore
+                task.add_done_callback(lambda t: self.sem.release())
+                # Callback to remove task from tasks
+                task.add_done_callback(self.tasks.remove)
+                # Add task into tasks
+                self.tasks.add(task)
+                try:
+                    result = await asyncio.wait_for(task, timeout=20)
+                    if (result):
+                        imgs_ok.append(image_url)
+                except asyncio.TimeoutError:
+                    print("couldn't add image:", image_url)
+                    task.cancel()
+                    pass
 
         self.done_images.extend(imgs_ok)
         return imgs_ok
@@ -204,7 +241,7 @@ class Crawler:
             data = (await resp.read()).decode('utf-8', 'replace')
             urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
             lastmod = resp.headers.get('last-modified')
-            imgs = await self.addimages(data)
+            imgs = await self.addimages(data, url)
             asyncio.Task(self.addurls([(u, url) for u in urls]))
             try:
                 pr = await self.urldict(url, self.changefreq)