From e65b8b39f2e37e62fc627cf80cde96dcedd5999f Mon Sep 17 00:00:00 2001
From: Pekka Helenius
Date: Sun, 3 May 2020 22:00:55 +0300
Subject: [PATCH] Implement image crawler

---
 README.md                          |  6 ++++
 pysitemap/__init__.py              |  5 ++--
 pysitemap/base_crawler.py          | 46 ++++++++++++++++++++++++++----
 pysitemap/format_processors/xml.py |  6 ++++
 4 files changed, 55 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index eb5c88a..3af4cee 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,12 @@ if __name__ == '__main__':
             '[a-zA-Z0-9]*\.[a-zA-Z0-9]*$',
             '\?\.php',
         ],
+        exclude_imgs=[
+            'logo\.(png|jpg)',
+            'avatars',
+            'avatar_default',
+            '/symbols/'
+        ],
         headers={'User-Agent': 'Crawler'},
         # TZ offset in hours
         timezone_offset=3,
diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py
index 5a9bb92..9626064 100644
--- a/pysitemap/__init__.py
+++ b/pysitemap/__init__.py
@@ -5,7 +5,7 @@ from pysitemap.base_crawler import Crawler
 
 def crawler(
         root_url, out_file, out_format='xml',
-        maxtasks=10, exclude_urls=[], verifyssl=True,
+        maxtasks=10, exclude_urls=[], exclude_imgs=[], verifyssl=True,
         headers=None, timezone_offset=0, changefreq=None,
         priorities=None):
     """
@@ -15,6 +15,7 @@
     :param out_format: format of out file [xml, txt]
     :param maxtasks: max count of tasks
     :param exclude_urls: excludable url paths
+    :param exclude_imgs: excludable img url paths
     :param verifyssl: verify website certificate?
     :param headers: Send these headers in every request
     :param timezone_offset: timezone offset for lastmod tags
@@ -25,7 +26,7 @@
     loop = asyncio.get_event_loop()
 
     c = Crawler(root_url, out_file=out_file, out_format=out_format,
-                maxtasks=maxtasks, exclude_urls=exclude_urls, verifyssl=verifyssl,
+                maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, verifyssl=verifyssl,
                 headers=headers, timezone_offset=timezone_offset, changefreq=changefreq,
                 priorities=priorities)
 
diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py
index 991c241..3ce1f88 100644
--- a/pysitemap/base_crawler.py
+++ b/pysitemap/base_crawler.py
@@ -14,8 +14,8 @@ class Crawler:
         'txt': TextWriter
     }
 
-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], verifyssl=True,
-                 headers=None, timezone_offset=0, changefreq=None, priorities=None,
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, exclude_urls=[], exclude_imgs=[],
+                 verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
                  todo_queue_backend=set, done_backend=dict):
         """
         Crawler constructor
@@ -25,10 +25,12 @@
         :type out_file: str
         :param out_format: sitemap type [xml | txt]. Default xml
         :type out_format: str
-        :param maxtasks: maximum count of tasks. Default 10
+        :param maxtasks: maximum count of tasks. Default 100
         :type maxtasks: int
         :param exclude_urls: excludable url paths relative to root url
        :type exclude_urls: list
+        :param exclude_imgs: excludable img url paths relative to root url
+        :type exclude_imgs: list
         :param verifyssl: verify website certificate?
         :type verifyssl: boolean
         :param timezone_offset: timezone offset for lastmod tags
@@ -40,6 +42,7 @@ class Crawler:
         """
         self.rooturl = rooturl
         self.exclude_urls = exclude_urls
+        self.exclude_imgs = exclude_imgs
         self.todo_queue = todo_queue_backend()
         self.busy = set()
         self.done = done_backend()
@@ -119,6 +122,36 @@ class Crawler:
             # Add task into tasks
             self.tasks.add(task)
 
+    async def addimages(self, data):
+        """
+        Find all images in website data
+        """
+        imgs = []
+        imgs_ok = []
+        lines_tmp = []
+        tag = False
+        for line in data.split('\n'):
+            if re.search(r'<img', line):
+                tag = True
+            if re.search(r'<\/img>', line):
+                tag = False
+                lines_tmp.append(line)
+                continue
+            if re.search(r'\/>', line) and tag:
+                tag = False
+            if tag:
+                lines_tmp.append(line)
+
+        imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
+
+        for img in imgs:
+            if not await self.contains(img, self.exclude_imgs, rlist=True):
+                if img.startswith(self.rooturl):
+                    imgs_ok.append(img)
+                elif not img.startswith("http"):
+                    imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
+        return imgs_ok
+
     async def process(self, url):
         """
         Process single url
@@ -134,13 +167,14 @@
         lastmod = None
         cf = None
         pr = None
+        imgs = []
 
         try:
             resp = await self.session.get(url)  # await response
         except Exception as exc:
             # on any exception mark url as BAD
             print('...', url, 'has error', repr(str(exc)))
-            self.done[url] = [False, lastmod, cf, pr]
+            self.done[url] = [False, lastmod, cf, pr, imgs]
         else:
             # only url with status == 200 and content type == 'text/html' parsed
             if (resp.status == 200 and
@@ -148,7 +182,7 @@
                 data = (await resp.read()).decode('utf-8', 'replace')
                 urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
                 lastmod = resp.headers.get('last-modified')
-
+                imgs = await self.addimages(data)
                 asyncio.Task(self.addurls([(u, url) for u in urls]))
                 try:
                     pr = await self.urldict(url, self.changefreq)
@@ -160,7 +194,7 @@
 
         # even if we have no exception, we can mark url as good
         resp.close()
-        self.done[url] = [True, lastmod, cf, pr]
+        self.done[url] = [True, lastmod, cf, pr, imgs]
         self.busy.remove(url)
 
         logging.info(len(self.done), 'completed tasks,', len(self.tasks),
diff --git a/pysitemap/format_processors/xml.py b/pysitemap/format_processors/xml.py
index 300394c..9eea948 100644
--- a/pysitemap/format_processors/xml.py
+++ b/pysitemap/format_processors/xml.py
@@ -22,6 +22,8 @@ class XMLWriter():
                     timestamp = data[1][1]
                     changefreq = data[1][2]
                     priority = data[1][3]
+                    images = data[1][4]
+
                     url = "<url><loc>{}</loc>".format(data[0])
 
                     if timestamp is not None:
@@ -34,6 +36,10 @@
                     if priority is not None:
                         url += "<priority>{}</priority>".format(str(priority))
 
+                    if len(images) > 0:
+                        for image in images:
+                            url += "<image:image><image:loc>{}</image:loc></image:image>".format(str(image))
+
                     await writer('{}\n'.format(url))
                     await aiodf.fsync()
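
Applied on top of the README example above, a minimal invocation that exercises the new exclude_imgs option could look like the sketch below. The root URL, output file name, and the two patterns are illustrative placeholders, not part of the patch; exclude_imgs entries are regular expressions matched against each collected img src (mirroring how exclude_urls is handled), and matching images are dropped before the sitemap is written.

    import logging
    import sys
    from pysitemap import crawler

    if __name__ == '__main__':
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)

        crawler(
            'https://www.example.com',      # placeholder root URL
            out_file='sitemap.xml',         # placeholder output path
            out_format='xml',
            exclude_imgs=[
                r'logo\.(png|jpg)',         # regex patterns, as in the README hunk
                r'avatars',
            ],
            headers={'User-Agent': 'Crawler'},
            # TZ offset in hours, used for lastmod timestamps
            timezone_offset=3,
        )

Image sources pointing at other hosts are skipped by addimages(), while relative src values are resolved against the root URL before the image entries are written to the sitemap.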