@@ -14,8 +14,8 @@ class Crawler:
         'txt': TextWriter
     }
 
-    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=100, exclude_urls=[], exclude_imgs=[],
-                 verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
+    def __init__(self, rooturl, out_file, out_format='xml', maxtasks=10, exclude_urls=list, exclude_imgs=list,
+                 image_root_urls=list, verifyssl=True, headers=None, timezone_offset=0, changefreq=None, priorities=None,
                  todo_queue_backend=set, done_backend=dict, done_images=list):
         """
         Crawler constructor
@@ -25,12 +25,14 @@ class Crawler:
         :type out_file: str
         :param out_format: sitemap type [xml | txt]. Default xml
         :type out_format: str
-        :param maxtasks: maximum count of tasks. Default 100
+        :param maxtasks: maximum count of tasks. Default 10
         :type maxtasks: int
         :param exclude_urls: excludable url paths relative to root url
         :type exclude_urls: list
         :param exclude_imgs: excludable img url paths relative to root url
         :type exclude_imgs: list
+        :param image_root_urls: recognized image root urls on the domain
+        :type image_root_urls: list
         :param verifyssl: verify website certificate?
         :type verifyssl: boolean
         :param timezone_offset: timezone offset for lastmod tags
@@ -43,6 +45,7 @@ class Crawler:
         self.rooturl = rooturl
         self.exclude_urls = exclude_urls
         self.exclude_imgs = exclude_imgs
+        self.image_root_urls = image_root_urls
         self.todo_queue = todo_queue_backend()
         self.busy = set()
         self.done = done_backend()
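
These hunks thread the new image_root_urls parameter through the constructor and store it on the instance. For illustration, a minimal sketch of a call site; the import path, hostname, and directory names are assumptions, not part of this diff:

    # Hypothetical usage: the module path and URLs below are illustrative only.
    from pysitemap.base_crawler import Crawler

    crawler = Crawler(
        rooturl='https://www.example.com/',
        out_file='sitemap.xml',
        out_format='xml',
        maxtasks=10,
        exclude_urls=[],
        exclude_imgs=[],
        # relative <img src> values found on pages under one of these roots
        # are resolved against that root before the mimetype check
        image_root_urls=['https://www.example.com/media/',
                         'https://www.example.com/static/'],
    )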
@@ -127,6 +130,10 @@ class Crawler:
         """
         Check url resource mimetype
         """
+
+        self.todo_queue.remove(url)
+        self.busy.add(url)
+
         try:
             resp = await self.session.get(url)
         except Exception as exc:
@@ -135,10 +142,14 @@ class Crawler:
         mime = resp.headers.get('content-type')
         if (resp.status == 200 and
                 bool(re.search(re.compile(r"{}".format(expected)), mime))):
+            resp.close()
+            self.busy.remove(url)
             return True
+        resp.close()
+        self.busy.remove(url)
         return False
 
-    async def addimages(self, data):
+    async def addimages(self, data, url):
         """
         Find all images in website data
         """
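
With these hunks, both the success and failure paths of mimechecker() now close the response and clear the URL from self.busy. The mimetype test itself reduces to a regex search against the Content-Type header; a self-contained sketch of that check, with illustrative sample values:

    import re

    def mime_matches(mime, expected):
        # Same test as mimechecker: compile the expected pattern and
        # search it against the raw Content-Type header value.
        return bool(re.search(re.compile(r"{}".format(expected)), mime))

    assert mime_matches('image/png', '^image/')
    assert mime_matches('image/jpeg; charset=binary', '^image/')
    assert not mime_matches('text/html; charset=utf-8', '^image/')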
@@ -161,15 +172,41 @@ class Crawler:
         imgs = re.findall(r'(?i)src=["\']?([^\s\"\'<>]+)', str(lines_tmp))
 
         for img in imgs:
+            image_url = ""
             if not await self.contains(img, self.exclude_imgs, rlist=True):
                 if img.startswith(self.rooturl):
-                    if (await self.mimechecker(img, '^image\/') and
-                            img not in self.done_images):
-                        imgs_ok.append(img)
+                    image_url = img
                 elif not img.startswith("http"):
-                    if (await self.mimechecker(img, '^image\/') and
-                            img not in self.done_images):
-                        imgs_ok.append(re.sub('/$', '', self.rooturl) + img)
+                    for image_root_url in self.image_root_urls:
+                        if url.startswith(image_root_url):
+                            image_url = image_root_url + img
+                            break
+
+            if (image_url != "" and
+                    image_url not in self.done_images and
+                    image_url not in self.busy and
+                    image_url not in self.todo_queue):
+                self.todo_queue.add(image_url)
+
+                # Acquire semaphore
+                await self.sem.acquire()
+                # Create async task
+                task = asyncio.ensure_future(self.mimechecker(image_url, '^image\/'))
+                # Add callback into task to release semaphore
+                task.add_done_callback(lambda t: self.sem.release())
+                # Callback to remove task from tasks
+                task.add_done_callback(self.tasks.remove)
+                # Add task into tasks
+                self.tasks.add(task)
+
+                try:
+                    result = await asyncio.wait_for(task, timeout=20)
+                    if (result):
+                        imgs_ok.append(image_url)
+                except asyncio.TimeoutError:
+                    print("couldn't add image:", image_url)
+                    task.cancel()
+                pass
 
         self.done_images.extend(imgs_ok)
         return imgs_ok
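
The scheduling block added above applies the crawler's semaphore-and-callback task pattern to each candidate image, with a 20-second cap per mimetype probe. A self-contained sketch of that pattern; probe() and the URL are stand-ins, not names from the diff:

    import asyncio

    async def probe(url):
        await asyncio.sleep(0.1)  # stand-in for the real mimetype request
        return True

    async def main():
        sem = asyncio.Semaphore(10)  # bounds concurrent probes
        tasks = set()

        await sem.acquire()                              # take a slot
        task = asyncio.ensure_future(probe('https://www.example.com/a.png'))
        task.add_done_callback(lambda t: sem.release())  # free the slot on completion
        task.add_done_callback(tasks.remove)             # drop the bookkeeping entry
        tasks.add(task)
        try:
            if await asyncio.wait_for(task, timeout=20):
                print('image accepted')
        except asyncio.TimeoutError:
            task.cancel()

    asyncio.run(main())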
@@ -204,7 +241,7 @@ class Crawler:
         data = (await resp.read()).decode('utf-8', 'replace')
         urls = re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', data)
         lastmod = resp.headers.get('last-modified')
-        imgs = await self.addimages(data)
+        imgs = await self.addimages(data, url)
         asyncio.Task(self.addurls([(u, url) for u in urls]))
 
         try: pr = await self.urldict(url, self.changefreq)