Set image url/data crawling optional

master
Pekka Helenius 4 years ago
parent commit 17d60efdec
4 changed files with 34 additions and 22 deletions
  1. +2  -0   README.md
  2. +7  -5   pysitemap/__init__.py
  3. +23 -17  pysitemap/base_crawler.py
  4. +2  -0   run.py

README.md  +2 -0

@@ -50,6 +50,8 @@ if __name__ == '__main__':
         out_file='sitemap.xml',
         maxtasks=100,
         verifyssl=False,
+        findimages=True,
+        images_this_domain=True,
         exclude_urls=[
             '/git/.*(action|commit|stars|activity|followers|following|\?sort|issues|pulls|milestones|archive|/labels$|/wiki$|/releases$|/forks$|/watchers$)',
             '/git/user/(sign_up|login|forgot_password)',


pysitemap/__init__.py  +7 -5

@@ -5,9 +5,9 @@ from pysitemap.base_crawler import Crawler

 def crawler(
         root_url, out_file, out_format='xml',
-        maxtasks=10, exclude_urls=list, exclude_imgs=list, image_root_urls=list,
-        verifyssl=True, headers=None, timezone_offset=0, changefreq=None,
-        priorities=None):
+        maxtasks=10, exclude_urls=[], exclude_imgs=[], image_root_urls=[],
+        verifyssl=True, findimages=True, images_this_domain=True, headers=None,
+        timezone_offset=0, changefreq=None, priorities=None):
     """
     run crawler
     :param root_url: Site root url
@@ -18,6 +18,8 @@ def crawler(
     :param exclude_imgs: excludable img url paths
     :param image_root_urls: recognized image root urls on the domain
     :param verifyssl: verify website certificate?
+    :param findimages: Find image references?
+    :param images_this_domain: Find only images that refer to this domain?
     :param headers: Send these headers in every request
     :param timezone_offset: timezone offset for lastmod tags
     :param changefreq: dictionary, where key is site sub url regex, and value is changefreq
@@ -28,8 +30,8 @@ def crawler(
     c = Crawler(root_url, out_file=out_file, out_format=out_format,
                 maxtasks=maxtasks, exclude_urls=exclude_urls, exclude_imgs=exclude_imgs,
-                image_root_urls=image_root_urls, verifyssl=verifyssl,
-                headers=headers, timezone_offset=timezone_offset,
+                image_root_urls=image_root_urls, verifyssl=verifyssl, findimages=findimages,
+                images_this_domain=images_this_domain, headers=headers, timezone_offset=timezone_offset,
                 changefreq=changefreq, priorities=priorities)
     loop.run_until_complete(c.run())
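
With this change a caller can disable image collection entirely, or limit it to the crawl domain, straight from the `crawler()` wrapper. A minimal usage sketch based on the signature above; the URL and output file name are placeholders:

    from pysitemap import crawler

    # Skip <img> tag scanning altogether: no image entries in the sitemap.
    crawler('https://www.example.org', out_file='sitemap.xml',
            findimages=False)

    # Or: collect images, but keep only those referring to the crawl domain.
    crawler('https://www.example.org', out_file='sitemap.xml',
            findimages=True, images_this_domain=True)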


pysitemap/base_crawler.py  +23 -17

@@ -15,8 +15,8 @@ class Crawler:
     }

     def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=[], exclude_imgs=[],
-                 image_root_urls=[], verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
-                 todo_queue_backend=set, done_backend=dict, done_images=list):
+                 image_root_urls=[], verifyssl=True, findimages=True, images_this_domain=True, headers=None, timezone_offset=0,
+                 changefreq=None, priorities=None, todo_queue_backend=set, done_backend=dict, done_images=list):
         """
         Crawler constructor
         :param rooturl: root url of site
@@ -34,7 +34,11 @@ class Crawler:
         :param image_root_urls: recognized image root urls on the domain
         :type image_root_urls: list
         :param verifyssl: verify website certificate?
-        :type verifyssl: boolean
+        :type verifyssl: bool
+        :param findimages: Find image references?
+        :type findimages: bool
+        :param images_this_domain: Find only images that refer to this domain?
+        :type images_this_domain: bool
         :param timezone_offset: timezone offset for lastmod tags
         :type timezone_offset: int
         :param changefreq: dictionary, where key is site sub url regex, and value is changefreq
@@ -46,6 +50,8 @@ class Crawler:
         self.exclude_urls = exclude_urls
         self.exclude_imgs = exclude_imgs
         self.image_root_urls = image_root_urls
+        self.findimages = findimages
+        self.images_this_domain = images_this_domain
         self.todo_queue = todo_queue_backend()
         self.busy = set()
         self.done = done_backend()
@@ -184,7 +190,6 @@ class Crawler:
             # Remove leading and trailing quote marks from value
             value = re.sub(r'^["\']?(.*?)["\']?$', '\\1', arg[1])
             value = re.sub(r'&', '&amp;', value)
-
             for field in fields:
                 if key == field:
                     arg_dict[field] = value
@@ -280,19 +285,20 @@ class Crawler:
         lastmod = resp.headers.get('last-modified')

-        # Ref: https://support.google.com/webmasters/answer/178636?hl=en
-        img_data = await self.fetchtags(
-            data, url, 'img',
-            fields=['src', 'title', 'caption', 'geo_location', 'license']
-        )
-        imgs = await self.addtagdata(
-            tagdata=img_data, url=url,
-            source_url_field='src', mimetype='^image\/',
-            tag_root_urls=self.image_root_urls,
-            excludes=self.exclude_imgs,
-            done_list=self.done_images,
-            this_domain=True
-        )
+        if self.findimages:
+            # Ref: https://support.google.com/webmasters/answer/178636?hl=en
+            img_data = await self.fetchtags(
+                data, url, 'img',
+                fields=['src', 'title', 'caption', 'geo_location', 'license']
+            )
+            imgs = await self.addtagdata(
+                tagdata=img_data, url=url,
+                source_url_field='src', mimetype='^image\/',
+                tag_root_urls=self.image_root_urls,
+                excludes=self.exclude_imgs,
+                done_list=self.done_images,
+                this_domain=self.images_this_domain
+            )

         asyncio.Task(self.addurls([(u, url) for u in urls]))
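
The same flags work when constructing `Crawler` directly rather than through the `crawler()` wrapper, and `this_domain` is no longer hard-coded to `True` but follows `images_this_domain`. A minimal sketch; the URL is a placeholder and the event-loop handling mirrors the wrapper shown earlier:

    import asyncio
    from pysitemap.base_crawler import Crawler

    # Crawl with image references restricted to the site's own domain.
    c = Crawler('https://www.example.org', out_file='sitemap.xml',
                findimages=True, images_this_domain=True)
    asyncio.get_event_loop().run_until_complete(c.run())

Note that `imgs` is only assigned inside the new `if self.findimages:` branch, so any later consumer of it presumably falls back to an empty collection when image crawling is disabled.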


run.py  +2 -0

@@ -9,6 +9,8 @@ if __name__ == '__main__':
         out_file='sitemap.xml',
         maxtasks=100,
         verifyssl=False,
+        findimages=True,
+        images_this_domain=True,
         exclude_urls=[
             '/git/.*(action|commit|stars|activity|followers|following|\?sort|issues|pulls|milestones|archive|/labels$|/wiki$|/releases$|/forks$|/watchers$)',
             '/git/user/(sign_up|login|forgot_password)',

