|
@ -6,8 +6,8 @@ from pysitemap.base_crawler import Crawler |
|
|
def crawler( |
|
|
def crawler( |
|
|
root_url, out_file, out_format='xml', |
|
|
root_url, out_file, out_format='xml', |
|
|
maxtasks=10, exclude_urls=[], exclude_imgs=[], image_root_urls=[], |
|
|
maxtasks=10, exclude_urls=[], exclude_imgs=[], image_root_urls=[], |
|
|
verifyssl=True, findimages=True, images_this_domain=True, headers=None, |
|
|
|
|
|
timezone_offset=0, changefreq=None, priorities=None): |
|
|
|
|
|
|
|
|
use_lastmodified=True, verifyssl=True, findimages=True, images_this_domain=True, |
|
|
|
|
|
headers=None, timezone_offset=0, changefreq=None, priorities=None): |
|
|
""" |
|
|
""" |
|
|
run crowler |
|
|
run crowler |
|
|
:param root_url: Site root url |
|
|
:param root_url: Site root url |
|
@ -17,6 +17,7 @@ def crawler( |
|
|
:param exclude_urls: excludable url paths |
|
|
:param exclude_urls: excludable url paths |
|
|
:param exclude_imgs: excludable img url paths |
|
|
:param exclude_imgs: excludable img url paths |
|
|
:param image_root_urls: recognized image root urls on the domain |
|
|
:param image_root_urls: recognized image root urls on the domain |
|
|
|
|
|
:param use_lastmodified: enable or disable timestamps for fetched urls? |
|
|
:param verifyssl: verify website certificate? |
|
|
:param verifyssl: verify website certificate? |
|
|
:param findimages: Find images references? |
|
|
:param findimages: Find images references? |
|
|
:param images_this_domain: Find images which refer to this domain only? |
|
|
:param images_this_domain: Find images which refer to this domain only? |
|
@ -30,9 +31,9 @@ def crawler( |
|
|
|
|
|
|
|
|
c = Crawler(root_url, out_file=out_file, out_format=out_format, |
|
|
c = Crawler(root_url, out_file=out_file, out_format=out_format, |
|
|
maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, |
|
|
maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, |
|
|
image_root_urls=image_root_urls, verifyssl=verifyssl, findimages=findimages, |
|
|
|
|
|
images_this_domain=images_this_domain, headers=headers, timezone_offset=timezone_offset, |
|
|
|
|
|
changefreq=changefreq, priorities=priorities) |
|
|
|
|
|
|
|
|
image_root_urls=image_root_urls, use_lastmodified=use_lastmodified, verifyssl=verifyssl, |
|
|
|
|
|
findimages=findimages, images_this_domain=images_this_domain, headers=headers, |
|
|
|
|
|
timezone_offset=timezone_offset, changefreq=changefreq, priorities=priorities) |
|
|
|
|
|
|
|
|
loop.run_until_complete(c.run()) |
|
|
loop.run_until_complete(c.run()) |
|
|
|
|
|
|
|
|