diff --git a/README.md b/README.md index 3af4cee..6f9c818 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,10 @@ if __name__ == '__main__': 'avatar_default', '/symbols/' ], + image_root_urls=[ + 'https://mytestsite.com/photos/', + 'https://mytestsite.com/git/', + ], headers={'User-Agent': 'Crawler'}, # TZ offset in hours timezone_offset=3, diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py index 9626064..2f5bdbd 100644 --- a/pysitemap/__init__.py +++ b/pysitemap/__init__.py @@ -5,8 +5,8 @@ from pysitemap.base_crawler import Crawler def crawler( root_url, out_file, out_format='xml', - maxtasks=10, exclude_urls=[], exclude_imgs=[], verifyssl=True, - headers=None, timezone_offset=0, changefreq=None, + maxtasks=10, exclude_urls=list, exclude_imgs=list, image_root_urls=list, + verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None): """ run crowler @@ -16,6 +16,7 @@ def crawler( :param maxtasks: max count of tasks :param exclude_urls: excludable url paths :param exclude_imgs: excludable img url paths + :param image_root_urls: recognized image root urls on the domain :param verifyssl: verify website certificate? :param headers: Send these headers in every request :param timezone_offset: timezone offset for lastmod tags @@ -26,7 +27,8 @@ def crawler( loop = asyncio.get_event_loop() c = Crawler(root_url, out_file=out_file, out_format=out_format, - maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, verifyssl=verifyssl, + maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, + image_root_urls=image_root_urls, verifyssl=verifyssl, headers=headers, timezone_offset=timezone_offset, changefreq=changefreq, priorities=priorities)