
some fixes

from-knocker
Kamo Petrosyan · 4 years ago · commit 26688aff8a
5 changed files with 50 additions and 15 deletions:
  1. +1  -0   pysitemap/__init__.py
  2. +35 -7   pysitemap/crawler.py
  3. +3  -2   requirements.txt
  4. +3  -2   run.py
  5. +8  -4   setup.py

+1 -0   pysitemap/__init__.py

@@ -0,0 +1 @@
+from pysitemap.crawler import Crawler
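With this re-export in place, the crawler is importable from the package root rather than the submodule; a minimal sketch (the URL is a placeholder):

    from pysitemap import Crawler

    crawler = Crawler(url='http://www.example.com', sleep_time=0.5)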

+35 -7   pysitemap/crawler.py

@@ -3,25 +3,53 @@
 import time
 import asyncio
 from aiohttp import ClientSession
+from parsel import Selector
+from urllib.parse import urlparse, urlunparse


-class Knocker(object):
-    def __init__(self, urls=None, sleep_time=.5):
-        self.urls = urls or []
+class Crawler(object):
+    def __init__(self, url, sleep_time=.5):
+        self.urls = [url]
+        # reduce the start URL to scheme://netloc so relative links can be resolved
+        scheme, netloc, path, params, query, fragment = urlparse(url)
+        if not netloc:
+            netloc, path = path, netloc
+        url = urlunparse((scheme, netloc, "", params, "", fragment))
+        self.base_url = url
         self.sleep_time = float(sleep_time)

     async def fetch(self, url, session):
         async with session.get(url) as response:
             await asyncio.sleep(self.sleep_time)
             status = response.status
             date = response.headers.get("DATE")
             print("{}:{} with status {}".format(date, response.url, status))
-            return url, status
+            return await response.text()  # parse() needs the body as text, not a StreamReader

     async def bound_fetch(self, sem, url, session):
         # Getter function with semaphore.
         async with sem:
             await self.fetch(url, session)

+    def norm_link(self, url: str):
+        # keep absolute same-site links, resolve protocol-relative ("//host/...")
+        # and root-relative ("/path") links, drop everything else
+        if url.startswith(self.base_url):
+            return url
+        elif url.startswith('//'):
+            return "{scheme}{url}".format(
+                scheme=self.base_url[:self.base_url.find(":") + 1],  # +1 keeps the colon
+                url=url
+            )
+        elif url.startswith("/"):
+            return self.base_url + url
+        return None
+
+    async def parse(self, content):
+        # collect every <a href> on the page and queue the same-site ones
+        sel = Selector(content)
+        links = sel.xpath('//a/@href').getall()
+        normalized_links = []
+        for link in links:
+            link = self.norm_link(link)
+            if link:
+                normalized_links.append(link)
+        self.urls.extend(normalized_links)
+
     async def run(self):
         tasks = []
         # create instance of Semaphore
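The hunk is truncated inside run(). A plausible completion, given the bound_fetch helper above, would cap concurrency with an asyncio.Semaphore and fan the queued URLs out over a single ClientSession; how the fetched content reaches parse() is not visible in the truncated hunk. This is a hypothetical sketch, not the commit's actual code; the limit of 5 and the gather call are assumptions:

    async def run(self):
        tasks = []
        # create instance of Semaphore; a cap of 5 concurrent requests is an assumption
        sem = asyncio.Semaphore(5)
        async with ClientSession() as session:
            for url in self.urls:
                # schedule one semaphore-bounded fetch per queued URL
                tasks.append(asyncio.ensure_future(self.bound_fetch(sem, url, session)))
            await asyncio.gather(*tasks)

Note that norm_link anchors every discovered href to base_url, so parse() only ever queues same-site URLs.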


+3 -2   requirements.txt

@@ -1,2 +1,3 @@
-lxml
-requests
+aiohttp
+asyncio
+parsel

+3 -2   run.py

@@ -12,8 +12,9 @@ if __name__ == '__main__':
     logfile = 'errlog.log'  # path to logfile
     oformat = 'xml'  # output format
     outputfile = 'sitemap.xml'  # path to output file

     loop = asyncio.get_event_loop()
-    crawler = Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
-    future = asyncio.ensure_future(crawler.crawl(echo=True))
+    crawler: Crawler = Crawler(url=url)
+    future = asyncio.ensure_future(crawler.run())
     loop.run_until_complete(future)
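For comparison, on Python 3.7+ the same entry point can be written with asyncio.run(), which creates and tears down the event loop itself; a sketch assuming the Crawler API from the diff above:

    import asyncio

    from pysitemap import Crawler

    if __name__ == '__main__':
        crawler = Crawler(url='http://www.example.com')  # placeholder URL
        asyncio.run(crawler.run())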

+8 -4   setup.py

@@ -3,6 +3,11 @@ from setuptools import find_packages, setup

 EXCLUDE_FROM_PACKAGES = ['tests',]

+def get_requirements():
+    requirements = []
+    with open('requirements.txt', 'r') as df:
+        requirements = df.readlines()
+    return [requirement.strip() for requirement in requirements]

 def get_version(major=0, minor=0, build=0):
     return '%s.%s.%s' % (major, minor, build)
@@ -28,11 +33,10 @@ setup(
         'License :: OSI Approved :: BSD License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.7',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
-    install_requires=['lxml', 'requests'],
-    requires=['lxml', 'requests']
+    install_requires=get_requirements(),
+    requires=get_requirements()
 )
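Deriving install_requires from requirements.txt keeps the two dependency lists in sync. Given the requirements.txt shown above, get_requirements() should evaluate to the following (assuming setup.py runs from the project root, where the file lives):

    >>> get_requirements()
    ['aiohttp', 'asyncio', 'parsel']

readlines() keeps the trailing newline on each entry, which is why the comprehension calls strip().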
