
some fixes

from-knocker
Kamo Petrosyan, 4 years ago
commit 26688aff8a
5 changed files with 50 additions and 15 deletions
1. pysitemap/__init__.py (+1, -0)
2. pysitemap/crawler.py (+35, -7)
3. requirements.txt (+3, -2)
4. run.py (+3, -2)
5. setup.py (+8, -4)

pysitemap/__init__.py (+1, -0)

@@ -0,0 +1 @@
+from pysitemap.crawler import Crawler
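
With this re-export, callers can import the class from the package root (a hypothetical usage line, not part of the diff):

    from pysitemap import Crawler  # instead of: from pysitemap.crawler import Crawler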

pysitemap/crawler.py (+35, -7)

@@ -3,25 +3,53 @@
 import time
 import asyncio
 from aiohttp import ClientSession
+from parsel import Selector
+from urllib.parse import urlparse, urlunparse

-class Knocker(object):
-    def __init__(self, urls=None, sleep_time=.5):
-        self.urls = urls or []
+class Crawler(object):
+    def __init__(self, url, sleep_time=.5):
+        self.urls = [url]
+        scheme, netloc, path, params, query, fragment = urlparse(url)
+        if not netloc:
+            netloc, path = path, netloc
+        url = urlunparse((scheme, netloc, "", params, "", fragment))
+        self.base_url = url
         self.sleep_time = float(sleep_time)

     async def fetch(self, url, session):
         async with session.get(url) as response:
             await asyncio.sleep(self.sleep_time)
-            status = response.status
-            date = response.headers.get("DATE")
-            print("{}:{} with status {}".format(date, response.url, status))
-            return url, status
+            # Return the decoded body so parse() can feed it to Selector;
+            # response.content would be aiohttp's raw StreamReader.
+            return await response.text()

     async def bound_fetch(self, sem, url, session):
         # Getter function with semaphore.
         async with sem:
             await self.fetch(url, session)

+    def norm_link(self, url: str):
+        if url.startswith(self.base_url):
+            return url
+        elif url.startswith('//'):
+            # Protocol-relative link: prepend base_url's scheme plus ":".
+            return "{scheme}:{url}".format(
+                scheme=self.base_url[:self.base_url.find(":")],
+                url=url
+            )
+        elif url.startswith("/"):
+            return self.base_url + url
+        return None
+
+    async def parse(self, content):
+        sel = Selector(content)
+        links = sel.xpath('//a/@href').getall()
+        normalized_links = []
+        for link in links:
+            link = self.norm_link(link)
+            if link:
+                normalized_links.append(link)
+        self.urls.extend(normalized_links)
+
     async def run(self):
         tasks = []
         # create instance of Semaphore

requirements.txt (+3, -2)

@@ -1,2 +1,3 @@
-lxml
-requests
+aiohttp
+asyncio
+parsel

run.py (+3, -2)

@@ -12,8 +12,9 @@ if __name__ == '__main__':
     logfile = 'errlog.log'  # path to logfile
     oformat = 'xml'  # output format
     outputfile = 'sitemap.xml'  # path to output file

     loop = asyncio.get_event_loop()
-    crawler = Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
-    future = asyncio.ensure_future(crawler.crawl(echo=True))
+    crawler: Crawler = Crawler(url=url)
+    future = asyncio.ensure_future(crawler.run())
     loop.run_until_complete(future)
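
The get_event_loop/ensure_future/run_until_complete pattern is the pre-3.7 idiom; since setup.py now targets Python 3.7, the entry point could also be written with asyncio.run, which creates and closes the loop itself. A sketch under that assumption (example.com is a placeholder):

    import asyncio
    from pysitemap import Crawler

    if __name__ == '__main__':
        crawler = Crawler(url='https://example.com')
        asyncio.run(crawler.run())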

setup.py (+8, -4)

@@ -3,6 +3,11 @@ from setuptools import find_packages, setup

 EXCLUDE_FROM_PACKAGES = ['tests',]

+def get_requirements():
+    requirements = []
+    with open('requirements.txt', 'r') as df:
+        requirements = df.readlines()
+    return [requirement.strip() for requirement in requirements]
+

 def get_version(major=0, minor=0, build=0):
     return '%s.%s.%s' % (major, minor, build)
@@ -28,11 +33,10 @@ setup(
         'License :: OSI Approved :: BSD License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.7',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
-    install_requires=['lxml', 'requests'],
-    requires=['lxml', 'requests']
+    install_requires=get_requirements(),
+    requires=get_requirements()
 )
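
One caveat with get_requirements() as written: readlines() keeps blank lines, and strip() turns them into empty strings, which setuptools can reject as invalid requirements. A slightly more defensive variant (a sketch, not the committed code):

    def get_requirements():
        # Drop blank lines and comment lines so setuptools never
        # receives an empty or non-requirement string.
        with open('requirements.txt', 'r') as df:
            return [line.strip() for line in df
                    if line.strip() and not line.lstrip().startswith('#')]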
