|
import sys
|
|
import logging
|
|
from pysitemap import crawler
|
|
|
|
if __name__ == '__main__':
|
|
root_url = 'https://mytestsite.com/'
|
|
crawler(
|
|
root_url,
|
|
out_file='sitemap.xml',
|
|
maxtasks=100,
|
|
verifyssl=False,
|
|
findimages=True,
|
|
images_this_domain=True,
|
|
exclude_urls=[
|
|
'/git/.*(action|commit|stars|activity|followers|following|\?sort|issues|pulls|milestones|archive|/labels$|/wiki$|/releases$|/forks$|/watchers$)',
|
|
'/git/user/(sign_up|login|forgot_password)',
|
|
'/css',
|
|
'/js',
|
|
'favicon',
|
|
'[a-zA-Z0-9]*\.[a-zA-Z0-9]*$',
|
|
'\?\.php',
|
|
],
|
|
exclude_imgs=[
|
|
'logo\.(png|jpg)',
|
|
'avatars',
|
|
'avatar_default',
|
|
'/symbols/'
|
|
],
|
|
image_root_urls=[
|
|
'https://mytestsite.com/photos/',
|
|
'https://mytestsite.com/git/',
|
|
],
|
|
headers={'User-Agent': 'Crawler'},
|
|
# TZ offset in hours
|
|
timezone_offset=3,
|
|
changefreq={
|
|
"/git/": "weekly",
|
|
"/": "monthly"
|
|
},
|
|
priorities={
|
|
"/git/": 0.7,
|
|
"/metasub/": 0.6,
|
|
"/": 0.5
|
|
}
|
|
)
|
|
|