Browse Source

Introduce 'image_root_urls' parameter

master
Pekka Helenius 4 years ago
parent
commit
45340e119c
2 changed files with 9 additions and 3 deletions
  1. +4
    -0
      README.md
  2. +5
    -3
      pysitemap/__init__.py

+ 4
- 0
README.md View File

@ -65,6 +65,10 @@ if __name__ == '__main__':
'avatar_default',
'/symbols/'
],
image_root_urls=[
'https://mytestsite.com/photos/',
'https://mytestsite.com/git/',
],
headers={'User-Agent': 'Crawler'},
# TZ offset in hours
timezone_offset=3,


+ 5
- 3
pysitemap/__init__.py View File

@ -5,8 +5,8 @@ from pysitemap.base_crawler import Crawler
def crawler(
root_url, out_file, out_format='xml',
maxtasks=10, exclude_urls=[], exclude_imgs=[], verifyssl=True,
headers=None, timezone_offset=0, changefreq=None,
maxtasks=10, exclude_urls=list, exclude_imgs=list, image_root_urls=list,
verifyssl=True, headers=None, timezone_offset=0, changefreq=None,
priorities=None):
"""
run crawler
@ -16,6 +16,7 @@ def crawler(
:param maxtasks: max count of tasks
:param exclude_urls: excludable url paths
:param exclude_imgs: excludable img url paths
:param image_root_urls: recognized image root urls on the domain
:param verifyssl: verify website certificate?
:param headers: Send these headers in every request
:param timezone_offset: timezone offset for lastmod tags
@ -26,7 +27,8 @@ def crawler(
loop = asyncio.get_event_loop()
c = Crawler(root_url, out_file=out_file, out_format=out_format,
maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs, verifyssl=verifyssl,
maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs,
image_root_urls=image_root_urls, verifyssl=verifyssl,
headers=headers, timezone_offset=timezone_offset,
changefreq=changefreq, priorities=priorities)


Loading…
Cancel
Save