diff --git a/README.md b/README.md index a42e488..69bbc83 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ if __name__ == '__main__': 'https://mytestsite.com/photos/', 'https://mytestsite.com/git/', ], + use_lastmodified=False, headers={'User-Agent': 'Crawler'}, # TZ offset in hours timezone_offset=3, diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py index cc47875..196feab 100644 --- a/pysitemap/__init__.py +++ b/pysitemap/__init__.py @@ -6,8 +6,8 @@ from pysitemap.base_crawler import Crawler def crawler( root_url, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[], image_root_urls=[], verifyssl=True, findimages=True, images_this_domain=True, headers=None, - timezone_offset=0, changefreq=None, priorities=None): + timezone_offset=0, changefreq=None, priorities=None, use_lastmodified=True): """ run crowler :param root_url: Site root url @@ -17,6 +17,7 @@ def crawler( :param exclude_urls: excludable url paths :param exclude_imgs: excludable img url paths :param image_root_urls: recognized image root urls on the domain + :param use_lastmodified: read the Last-Modified header of fetched urls? :param verifyssl: verify website certificate? :param findimages: Find images references? :param images_this_domain: Find images which refer to this domain only? 
@@ -30,9 +31,9 @@ def crawler( c = Crawler(root_url, out_file=out_file, out_format=out_format, maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, - image_root_urls=image_root_urls, verifyssl=verifyssl, findimages=findimages, - images_this_domain=images_this_domain, headers=headers, timezone_offset=timezone_offset, - changefreq=changefreq, priorities=priorities) + image_root_urls=image_root_urls, use_lastmodified=use_lastmodified, verifyssl=verifyssl, + findimages=findimages, images_this_domain=images_this_domain, headers=headers, + timezone_offset=timezone_offset, changefreq=changefreq, priorities=priorities) loop.run_until_complete(c.run()) diff --git a/pysitemap/base_crawler.py b/pysitemap/base_crawler.py index ceabea7..f2d3983 100644 --- a/pysitemap/base_crawler.py +++ b/pysitemap/base_crawler.py @@ -15,8 +15,9 @@ class Crawler: } def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[], image_root_urls=[], verifyssl=True, findimages=True, images_this_domain=True, headers=None, timezone_offset=0, - changefreq=None, priorities=None, todo_queue_backend=set, done_backend=dict, done_images=list): + changefreq=None, priorities=None, todo_queue_backend=set, done_backend=dict, done_images=list, + use_lastmodified=True): """ Crawler constructor :param rooturl: root url of site @@ -33,6 +34,8 @@ class Crawler: :type exclude_imgs: list :param image_root_urls: recognized image root urls on the domain :type image_root_urls: list + :param use_lastmodified: read the Last-Modified header of fetched urls? + :type use_lastmodified: bool :param verifyssl: verify website certificate? :type verifyssl: bool :param findimages: Find images references? 
@@ -49,6 +52,7 @@ class Crawler: self.rooturl = rooturl self.exclude_urls = exclude_urls self.exclude_imgs = exclude_imgs + self.use_lastmodified = use_lastmodified self.image_root_urls = image_root_urls self.findimages = findimages self.images_this_domain = images_this_domain @@ -281,7 +285,7 @@ class Crawler: data = (await resp.read()).decode('utf-8', 'replace') urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data) - lastmod = resp.headers.get('last-modified') + lastmod = resp.headers.get('last-modified') if self.use_lastmodified else None if self.findimages: # Ref: https://support.google.com/webmasters/answer/178636?hl=en diff --git a/run.py b/run.py index e0a0aa1..0815fbd 100644 --- a/run.py +++ b/run.py @@ -30,6 +30,7 @@ if __name__ == '__main__': 'https://mytestsite.com/photos/', 'https://mytestsite.com/git/', ], + use_lastmodified=False, headers={'User-Agent': 'Crawler'}, # TZ offset in hours timezone_offset=3,