
Make the 'last modified' (lastmod) tag optional

master
Pekka Helenius, 4 years ago
commit ef19eb9815
4 changed files with 16 additions and 8 deletions:

  1. README.md                  +1 -0
  2. pysitemap/__init__.py      +6 -5
  3. pysitemap/base_crawler.py  +8 -3
  4. run.py                     +1 -0

README.md  (+1 -0)

@@ -71,6 +71,7 @@ if __name__ == '__main__':
             'https://mytestsite.com/photos/',
             'https://mytestsite.com/git/',
         ],
+        use_lastmodified=False,
         headers={'User-Agent': 'Crawler'},
         # TZ offset in hours
         timezone_offset=3,

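Putting the README fragment above together with the new keyword, a complete call might look like the sketch below. The root URL, output file name, and user agent are illustrative placeholders taken from the example; only use_lastmodified is new in this commit, and it defaults to True (see the signature change in pysitemap/__init__.py below), so existing callers are unaffected.

    # Illustrative sketch, not part of the commit: build a sitemap without <lastmod> entries.
    from pysitemap import crawler

    crawler(
        'https://mytestsite.com/',              # placeholder root url
        out_file='sitemap.xml',                 # placeholder output file
        use_lastmodified=False,                 # new keyword; defaults to True
        headers={'User-Agent': 'Crawler'},
    )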

pysitemap/__init__.py  (+6 -5)

@@ -6,8 +6,8 @@ from pysitemap.base_crawler import Crawler
 def crawler(
         root_url, out_file, out_format='xml',
         maxtasks=10, exclude_urls=[], exclude_imgs=[], image_root_urls=[],
-        verifyssl=True, findimages=True, images_this_domain=True, headers=None,
-        timezone_offset=0, changefreq=None, priorities=None):
+        use_lastmodified=True, verifyssl=True, findimages=True, images_this_domain=True,
+        headers=None, timezone_offset=0, changefreq=None, priorities=None):
     """
     run crowler
     :param root_url: Site root url
@@ -17,6 +17,7 @@ def crawler(
     :param exclude_urls: excludable url paths
     :param exclude_imgs: excludable img url paths
     :param image_root_urls: recognized image root urls on the domain
+    :param use_lastmodified: enable or disable timestamps for fetched urls?
     :param verifyssl: verify website certificate?
     :param findimages: Find images references?
     :param images_this_domain: Find images which refer to this domain only?
@@ -30,9 +31,9 @@ def crawler(
     c = Crawler(root_url, out_file=out_file, out_format=out_format,
                 maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs,
-                image_root_urls=image_root_urls, verifyssl=verifyssl, findimages=findimages,
-                images_this_domain=images_this_domain, headers=headers, timezone_offset=timezone_offset,
-                changefreq=changefreq, priorities=priorities)
+                image_root_urls=image_root_urls, use_lastmodified=use_lastmodified, verifyssl=verifyssl,
+                findimages=findimages, images_this_domain=images_this_domain, headers=headers,
+                timezone_offset=timezone_offset, changefreq=changefreq, priorities=priorities)
     loop.run_until_complete(c.run())

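Since the crawler() wrapper simply forwards the flag, the Crawler class can also be driven directly. A rough sketch, assuming the default asyncio event loop and using only arguments visible in the constructor diff below; the URL and file name are placeholders:

    # Rough sketch (assumption, not project code): use Crawler without the crawler() helper.
    import asyncio
    from pysitemap.base_crawler import Crawler

    c = Crawler('https://mytestsite.com/', out_file='sitemap.xml',
                use_lastmodified=False)         # skip Last-Modified lookups entirely
    asyncio.get_event_loop().run_until_complete(c.run())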

pysitemap/base_crawler.py  (+8 -3)

@@ -15,8 +15,9 @@ class Crawler:
     }
     def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[],
-                 image_root_urls=[], verifyssl=True, findimages=True, images_this_domain=True, headers=None, timezone_offset=0,
-                 changefreq=None, priorities=None, todo_queue_backend=set, done_backend=dict, done_images=list):
+                 image_root_urls=[], use_lastmodified=True, verifyssl=True, findimages=True, images_this_domain=True,
+                 headers=None, timezone_offset=0, changefreq=None, priorities=None, todo_queue_backend=set,
+                 done_backend=dict, done_images=list):
         """
         Crawler constructor
         :param rooturl: root url of site
@@ -33,6 +34,8 @@ class Crawler:
         :type exclude_imgs: list
         :param image_root_urls: recognized image root urls on the domain
         :type image_root_urls: list
+        :param use_lastmodified: enable or disable timestamps for fetched urls?
+        :type use_lastmodified: bool
         :param verifyssl: verify website certificate?
         :type verifyssl: bool
         :param findimages: Find images references?
@@ -49,6 +52,7 @@ class Crawler:
         self.rooturl = rooturl
         self.exclude_urls = exclude_urls
         self.exclude_imgs = exclude_imgs
+        self.use_lastmodified = use_lastmodified
         self.image_root_urls = image_root_urls
         self.findimages = findimages
         self.images_this_domain = images_this_domain
@@ -281,7 +285,8 @@ class Crawler:
                     data = (await resp.read()).decode('utf-8', 'replace')
                     urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
-                    lastmod = resp.headers.get('last-modified')
+                    if self.use_lastmodified:
+                        lastmod = resp.headers.get('last-modified')
                     if self.findimages:
                         # Ref: https://support.google.com/webmasters/answer/178636?hl=en

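For context on what the flag gates: when use_lastmodified is enabled, the crawler reads each response's HTTP Last-Modified header. How that value is later formatted for the sitemap is outside this hunk, so the helper below is only a hypothetical sketch of a typical conversion, applying the timezone_offset parameter in hours; the function name and output format are assumptions, not the project's actual code.

    # Hypothetical helper, not part of pysitemap: convert a Last-Modified header
    # into a W3C-style timestamp, honouring the use_lastmodified switch.
    from datetime import timedelta
    from email.utils import parsedate_to_datetime

    def lastmod_from_headers(headers, use_lastmodified=True, timezone_offset=0):
        if not use_lastmodified:
            return None                           # <lastmod> is simply omitted
        value = headers.get('last-modified')      # e.g. 'Wed, 21 Oct 2015 07:28:00 GMT'
        if value is None:
            return None
        dt = parsedate_to_datetime(value) + timedelta(hours=timezone_offset)
        return dt.strftime('%Y-%m-%dT%H:%M:%S')

    # Example: lastmod_from_headers({'last-modified': 'Wed, 21 Oct 2015 07:28:00 GMT'},
    #                               timezone_offset=3) -> '2015-10-21T10:28:00'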

run.py  (+1 -0)

@@ -30,6 +30,7 @@ if __name__ == '__main__':
             'https://mytestsite.com/photos/',
             'https://mytestsite.com/git/',
         ],
+        use_lastmodified=False,
         headers={'User-Agent': 'Crawler'},
         # TZ offset in hours
         timezone_offset=3,

