Browse Source

Replace run.py

master
Pekka Helenius 4 years ago
parent
commit
a86f76a7c4
1 changed file with 38 additions and 9 deletions
  1. +38
    -9
      run.py

+ 38
- 9
run.py View File

@@ -3,13 +3,42 @@ import logging
from pysitemap import crawler
if __name__ == '__main__':
if '--iocp' in sys.argv:
    # Opt-in switch: install asyncio's proactor event loop as the current
    # event loop. The import stays inside the branch so it only runs when
    # the flag is given (asyncio.windows_events is a Windows-only module).
    from asyncio import events, windows_events

    # Consume the flag: remove it from sys.argv.
    sys.argv.remove('--iocp')
    logging.info('using iocp')

    proactor_loop = windows_events.ProactorEventLoop()
    events.set_event_loop(proactor_loop)
# Root URL handed to the crawler; the image root URLs below sit under it.
root_url = 'https://mytestsite.com/'
crawler(
    root_url,
    out_file='sitemap.xml',
    maxtasks=100,
    # NOTE(review): certificate verification is switched off here -- confirm
    # this is intended for anything other than a test site.
    verifyssl=False,
    # The entries below use regular-expression syntax. Any pattern that
    # contains a backslash is written as a raw string so that `\?` and `\.`
    # reach the crawler unchanged instead of being parsed as (invalid)
    # string escape sequences, which newer Python versions warn about.
    exclude_urls=[
        r'/git/.*(action|commit|stars|activity|followers|following|\?sort|issues|pulls|milestones|archive|/labels$|/wiki$|/releases$|/forks$|/watchers$)',
        '/git/user/(sign_up|login|forgot_password)',
        '/css',
        '/js',
        'favicon',
        r'[a-zA-Z0-9]*\.[a-zA-Z0-9]*$',
        # NOTE(review): as a regex this matches a literal "?.php" -- confirm
        # that this is the pattern that was meant.
        r'\?\.php',
    ],
    exclude_imgs=[
        r'logo\.(png|jpg)',
        'avatars',
        'avatar_default',
        '/symbols/',
    ],
    image_root_urls=[
        'https://mytestsite.com/photos/',
        'https://mytestsite.com/git/',
    ],
    headers={'User-Agent': 'Crawler'},
    # TZ offset in hours
    timezone_offset=3,
    # Keys of the two mappings below are URL path prefixes; presumably the
    # crawler picks the value for the prefix a page falls under -- verify
    # the matching rule against the crawler implementation.
    changefreq={
        "/git/": "weekly",
        "/": "monthly",
    },
    priorities={
        "/git/": 0.7,
        "/metasub/": 0.6,
        "/": 0.5,
    },
)
# The site is hard-coded here; the first command-line argument
# (sys.argv[1]) could be used as the root URL instead.
root_url = 'https://www.metpromstroi.ru'

# Crawl the site with default settings and write the result to sitemap.xml.
crawler(
    root_url,
    out_file='sitemap.xml',
)

Loading…
Cancel
Save