diff --git a/pysitemap/__init__.py b/pysitemap/__init__.py
new file mode 100644
index 0000000..bd13691
--- /dev/null
+++ b/pysitemap/__init__.py
@@ -0,0 +1 @@
+from pysitemap.crawler import Crawler
\ No newline at end of file
diff --git a/pysitemap/crawler.py b/pysitemap/crawler.py
index 228465a..378ad7a 100644
--- a/pysitemap/crawler.py
+++ b/pysitemap/crawler.py
@@ -3,25 +3,53 @@ import time
 import asyncio
 from aiohttp import ClientSession
+from parsel import Selector
+from urllib.parse import urlparse, urlunparse
 
 
-class Knocker(object):
-    def __init__(self, urls=None, sleep_time=.5):
-        self.urls = urls or []
+
+class Crawler(object):
+    def __init__(self, url, sleep_time=.5):
+
+        self.urls = [url]
+        scheme, netloc, path, params, query, fragment = urlparse(url)
+        if not netloc:
+            netloc, path = path, netloc
+        url = urlunparse((scheme, netloc, "", params, "", fragment))
+        self.base_url = url
         self.sleep_time = float(sleep_time)
 
     async def fetch(self, url, session):
         async with session.get(url) as response:
             await asyncio.sleep(self.sleep_time)
-            status = response.status
-            date = response.headers.get("DATE")
-            print("{}:{} with status {}".format(date, response.url, status))
-            return url, status
+            return response.content
 
     async def bound_fetch(self, sem, url, session):
         # Getter function with semaphore.
         async with sem:
             await self.fetch(url, session)
 
+    def norm_link(self, url:str):
+        if url.startswith(self.base_url):
+            return url
+        elif url.startswith('//'):
+            return "{scheme}{url}".format(
+                scheme=self.base_url[:self.base_url.find(":")],
+                url=url
+            )
+        elif url.startswith("/"):
+            return self.base_url + url
+        return None
+
+    async def parse(self, content):
+        sel = Selector(content)
+        links = sel.xpath('//a/@href').getall()
+        normalized_links = []
+        for link in links:
+            link = self.norm_link(link)
+            if link:
+                normalized_links.append(link)
+        self.urls.extend(normalized_links)
+
     async def run(self):
         tasks = []
         # create instance of Semaphore
diff --git a/requirements.txt b/requirements.txt
index e5fc25b..320207f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
-lxml
-requests
+aiohttp
+asyncio
+parsel
\ No newline at end of file
diff --git a/run.py b/run.py
index 7997b13..c66b184 100644
--- a/run.py
+++ b/run.py
@@ -12,8 +12,9 @@ if __name__ == '__main__':
     logfile = 'errlog.log'  # path to logfile
     oformat = 'xml'  # output format
     outputfile = 'sitemap.xml'  # path to output file
+
     loop = asyncio.get_event_loop()
-    crawler = Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
-    future = asyncio.ensure_future(crawler.crawl(echo=True))
+    crawler: Crawler = Crawler(url=url)
+    future = asyncio.ensure_future(crawler.run())
 
     loop.run_until_complete(future)
diff --git a/setup.py b/setup.py
index c6b0ecb..fa5dff1 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,11 @@ from setuptools import find_packages, setup
 
 EXCLUDE_FROM_PACKAGES = ['tests',]
 
+def get_requirements():
+    requirements = []
+    with open('requirements.txt', 'r') as df:
+        requirements = df.readlines()
+    return [requirement.strip() for requirement in requirements]
 
 
 def get_version(major=0, minor=0, build=0):
     return '%s.%s.%s' % (major, minor, build)
@@ -28,11 +33,10 @@ setup(
         'License :: OSI Approved :: BSD License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.7',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
-    install_requires=['lxml', 'requests'],
-    requires=['lxml', 'requests']
+    install_requires=get_requirements(),
+    requires=get_requirements()
 )
\ No newline at end of file
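
The reworked Crawler splits the pipeline into fetching (fetch/bound_fetch), link normalization (norm_link), and HTML extraction (parse), with run() (only partially visible in the hunk above) fanning requests out behind a semaphore. The sketch below is a minimal, self-contained illustration of how those pieces are meant to compose, not the module itself; the helper names (crawl, max_concurrency) are illustrative and the start URL is a placeholder. It swaps the standard library's urljoin in for the hand-rolled norm_link, and reads each body with `await response.text()`, since aiohttp's response.content is a StreamReader rather than the decoded text that parsel's Selector expects.

    import asyncio
    from urllib.parse import urljoin, urlparse

    from aiohttp import ClientSession
    from parsel import Selector


    async def fetch(url, session, sem, sleep_time=0.5):
        # Hold the semaphore so at most max_concurrency requests are in flight.
        async with sem:
            async with session.get(url) as response:
                await asyncio.sleep(sleep_time)  # politeness delay, as in Crawler.fetch
                # response.content is a StreamReader; the decoded body
                # comes from `await response.text()`.
                return await response.text()


    async def crawl(start_url, max_concurrency=5):
        parsed = urlparse(start_url)
        base_url = "{}://{}".format(parsed.scheme, parsed.netloc)
        seen, frontier = {start_url}, [start_url]
        sem = asyncio.Semaphore(max_concurrency)

        async with ClientSession() as session:
            while frontier:
                # Fetch the current frontier concurrently, bounded by the semaphore.
                pages = await asyncio.gather(
                    *(fetch(url, session, sem) for url in frontier),
                    return_exceptions=True,
                )
                frontier = []
                for html in pages:
                    if isinstance(html, Exception):
                        continue  # skip pages that failed to download
                    for href in Selector(text=html).xpath('//a/@href').getall():
                        # urljoin resolves "/path" and "//host/path" forms,
                        # which is what norm_link does by hand.
                        link = urljoin(base_url, href)
                        if link.startswith(base_url) and link not in seen:
                            seen.add(link)
                            frontier.append(link)
        return seen


    if __name__ == '__main__':
        loop = asyncio.get_event_loop()
        print('\n'.join(sorted(loop.run_until_complete(crawl('https://example.com')))))

Using urljoin also sidesteps a subtlety in norm_link's protocol-relative branch: self.base_url[:self.base_url.find(":")] yields the scheme without its trailing colon, so "{scheme}{url}" produces e.g. "https//example.com/..." for a "//example.com/..." href.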