Browse Source

modified: .gitignore

modified:   LICENSE
modified:   NOTICE
modified:   README.md
modified:   composer.json
modified:   pysitemap/crawler.py
modified:   requirements.txt
modified:   run.py
modified:   setup.py
pysitemap-python-2.7
7 years ago
parent
commit
2c4d7c5a11
10 changed files with 1650 additions and 1650 deletions
  1. +61
    -61
      .gitignore
  2. +202
    -202
      LICENSE
  3. +15
    -15
      NOTICE
  4. +38
    -38
      README.md
  5. +20
    -20
      composer.json
  6. +153
    -153
      pysitemap/crawler.py
  7. +2
    -2
      requirements.txt
  8. +16
    -16
      run.py
  9. +36
    -36
      setup.py
  10. +1107
    -1107
      sitemap.xml

+ 61
- 61
.gitignore View File

@ -1,61 +1,61 @@
.idea/ .idea/
tests/ tests/
# Byte-compiled / optimized / DLL files
# Byte-compiled / optimized / DLL files __pycache__/
__pycache__/ *.py[cod]
*.py[cod] # C extensions
*.so
# C extensions # Distribution / packaging
*.so .Python
env/
# Distribution / packaging build/
.Python develop-eggs/
env/ dist/
build/ downloads/
develop-eggs/ eggs/
dist/ .eggs/
downloads/ lib/
eggs/ lib64/
.eggs/ parts/
lib/ sdist/
lib64/ var/
parts/ *.egg-info/
sdist/ .installed.cfg
var/ *.egg
*.egg-info/ # PyInstaller
.installed.cfg # Usually these files are written by a python script from a template
*.egg # before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
# PyInstaller *.spec
# Usually these files are written by a python script from a template # Installer logs
# before PyInstaller builds the exe, so as to inject date/other infos into it. pip-log.txt
*.manifest pip-delete-this-directory.txt
*.spec # Unit test / coverage reports
htmlcov/
# Installer logs .tox/
pip-log.txt .coverage
pip-delete-this-directory.txt .coverage.*
.cache
# Unit test / coverage reports nosetests.xml
htmlcov/ coverage.xml
.tox/ *,cover
.coverage # Translations
.coverage.* *.mo
.cache *.pot
nosetests.xml # Django stuff:
coverage.xml *.log
*,cover # Sphinx documentation
docs/_build/
# Translations # PyBuilder
*.mo target/
*.pot run.py
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
run.py

+ 202
- 202
LICENSE View File

@ -1,202 +1,202 @@
Apache License Apache License
Version 2.0, January 2004 Version 2.0, January 2004
http://www.apache.org/licenses/ http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
1. Definitions. and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
"License" shall mean the terms and conditions for use, reproduction, the copyright owner that is granting the License.
and distribution as defined by Sections 1 through 9 of this document. "Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
"Licensor" shall mean the copyright owner or entity authorized by control with that entity. For the purposes of this definition,
the copyright owner that is granting the License. "control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
"Legal Entity" shall mean the union of the acting entity and all otherwise, or (ii) ownership of fifty percent (50%) or more of the
other entities that control, are controlled by, or are under common outstanding shares, or (iii) beneficial ownership of such entity.
control with that entity. For the purposes of this definition, "You" (or "Your") shall mean an individual or Legal Entity
"control" means (i) the power, direct or indirect, to cause the exercising permissions granted by this License.
direction or management of such entity, whether by contract or "Source" form shall mean the preferred form for making modifications,
otherwise, or (ii) ownership of fifty percent (50%) or more of the including but not limited to software source code, documentation
outstanding shares, or (iii) beneficial ownership of such entity. source, and configuration files.
"Object" form shall mean any form resulting from mechanical
"You" (or "Your") shall mean an individual or Legal Entity transformation or translation of a Source form, including but
exercising permissions granted by this License. not limited to compiled object code, generated documentation,
and conversions to other media types.
"Source" form shall mean the preferred form for making modifications, "Work" shall mean the work of authorship, whether in Source or
including but not limited to software source code, documentation Object form, made available under the License, as indicated by a
source, and configuration files. copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Object" form shall mean any form resulting from mechanical "Derivative Works" shall mean any work, whether in Source or Object
transformation or translation of a Source form, including but form, that is based on (or derived from) the Work and for which the
not limited to compiled object code, generated documentation, editorial revisions, annotations, elaborations, or other modifications
and conversions to other media types. represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
"Work" shall mean the work of authorship, whether in Source or separable from, or merely link (or bind by name) to the interfaces of,
Object form, made available under the License, as indicated by a the Work and Derivative Works thereof.
copyright notice that is included in or attached to the work "Contribution" shall mean any work of authorship, including
(an example is provided in the Appendix below). the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
"Derivative Works" shall mean any work, whether in Source or Object submitted to Licensor for inclusion in the Work by the copyright owner
form, that is based on (or derived from) the Work and for which the or by an individual or Legal Entity authorized to submit on behalf of
editorial revisions, annotations, elaborations, or other modifications the copyright owner. For the purposes of this definition, "submitted"
represent, as a whole, an original work of authorship. For the purposes means any form of electronic, verbal, or written communication sent
of this License, Derivative Works shall not include works that remain to the Licensor or its representatives, including but not limited to
separable from, or merely link (or bind by name) to the interfaces of, communication on electronic mailing lists, source code control systems,
the Work and Derivative Works thereof. and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
"Contribution" shall mean any work of authorship, including excluding communication that is conspicuously marked or otherwise
the original version of the Work and any modifications or additions designated in writing by the copyright owner as "Not a Contribution."
to that Work or Derivative Works thereof, that is intentionally "Contributor" shall mean Licensor and any individual or Legal Entity
submitted to Licensor for inclusion in the Work by the copyright owner on behalf of whom a Contribution has been received by Licensor and
or by an individual or Legal Entity authorized to submit on behalf of subsequently incorporated within the Work.
the copyright owner. For the purposes of this definition, "submitted" 2. Grant of Copyright License. Subject to the terms and conditions of
means any form of electronic, verbal, or written communication sent this License, each Contributor hereby grants to You a perpetual,
to the Licensor or its representatives, including but not limited to worldwide, non-exclusive, no-charge, royalty-free, irrevocable
communication on electronic mailing lists, source code control systems, copyright license to reproduce, prepare Derivative Works of,
and issue tracking systems that are managed by, or on behalf of, the publicly display, publicly perform, sublicense, and distribute the
Licensor for the purpose of discussing and improving the Work, but Work and such Derivative Works in Source or Object form.
excluding communication that is conspicuously marked or otherwise 3. Grant of Patent License. Subject to the terms and conditions of
designated in writing by the copyright owner as "Not a Contribution." this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
"Contributor" shall mean Licensor and any individual or Legal Entity (except as stated in this section) patent license to make, have made,
on behalf of whom a Contribution has been received by Licensor and use, offer to sell, sell, import, and otherwise transfer the Work,
subsequently incorporated within the Work. where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
2. Grant of Copyright License. Subject to the terms and conditions of Contribution(s) alone or by combination of their Contribution(s)
this License, each Contributor hereby grants to You a perpetual, with the Work to which such Contribution(s) was submitted. If You
worldwide, non-exclusive, no-charge, royalty-free, irrevocable institute patent litigation against any entity (including a
copyright license to reproduce, prepare Derivative Works of, cross-claim or counterclaim in a lawsuit) alleging that the Work
publicly display, publicly perform, sublicense, and distribute the or a Contribution incorporated within the Work constitutes direct
Work and such Derivative Works in Source or Object form. or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
3. Grant of Patent License. Subject to the terms and conditions of as of the date such litigation is filed.
this License, each Contributor hereby grants to You a perpetual, 4. Redistribution. You may reproduce and distribute copies of the
worldwide, non-exclusive, no-charge, royalty-free, irrevocable Work or Derivative Works thereof in any medium, with or without
(except as stated in this section) patent license to make, have made, modifications, and in Source or Object form, provided that You
use, offer to sell, sell, import, and otherwise transfer the Work, meet the following conditions:
where such license applies only to those patent claims licensable (a) You must give any other recipients of the Work or
by such Contributor that are necessarily infringed by their Derivative Works a copy of this License; and
Contribution(s) alone or by combination of their Contribution(s) (b) You must cause any modified files to carry prominent notices
with the Work to which such Contribution(s) was submitted. If You stating that You changed the files; and
institute patent litigation against any entity (including a (c) You must retain, in the Source form of any Derivative Works
cross-claim or counterclaim in a lawsuit) alleging that the Work that You distribute, all copyright, patent, trademark, and
or a Contribution incorporated within the Work constitutes direct attribution notices from the Source form of the Work,
or contributory patent infringement, then any patent licenses excluding those notices that do not pertain to any part of
granted to You under this License for that Work shall terminate the Derivative Works; and
as of the date such litigation is filed. (d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
4. Redistribution. You may reproduce and distribute copies of the include a readable copy of the attribution notices contained
Work or Derivative Works thereof in any medium, with or without within such NOTICE file, excluding those notices that do not
modifications, and in Source or Object form, provided that You pertain to any part of the Derivative Works, in at least one
meet the following conditions: of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
(a) You must give any other recipients of the Work or documentation, if provided along with the Derivative Works; or,
Derivative Works a copy of this License; and within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
(b) You must cause any modified files to carry prominent notices of the NOTICE file are for informational purposes only and
stating that You changed the files; and do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
(c) You must retain, in the Source form of any Derivative Works or as an addendum to the NOTICE text from the Work, provided
that You distribute, all copyright, patent, trademark, and that such additional attribution notices cannot be construed
attribution notices from the Source form of the Work, as modifying the License.
excluding those notices that do not pertain to any part of You may add Your own copyright statement to Your modifications and
the Derivative Works; and may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
(d) If the Work includes a "NOTICE" text file as part of its for any such Derivative Works as a whole, provided Your use,
distribution, then any Derivative Works that You distribute must reproduction, and distribution of the Work otherwise complies with
include a readable copy of the attribution notices contained the conditions stated in this License.
within such NOTICE file, excluding those notices that do not 5. Submission of Contributions. Unless You explicitly state otherwise,
pertain to any part of the Derivative Works, in at least one any Contribution intentionally submitted for inclusion in the Work
of the following places: within a NOTICE text file distributed by You to the Licensor shall be under the terms and conditions of
as part of the Derivative Works; within the Source form or this License, without any additional terms or conditions.
documentation, if provided along with the Derivative Works; or, Notwithstanding the above, nothing herein shall supersede or modify
within a display generated by the Derivative Works, if and the terms of any separate license agreement you may have executed
wherever such third-party notices normally appear. The contents with Licensor regarding such Contributions.
of the NOTICE file are for informational purposes only and 6. Trademarks. This License does not grant permission to use the trade
do not modify the License. You may add Your own attribution names, trademarks, service marks, or product names of the Licensor,
notices within Derivative Works that You distribute, alongside except as required for reasonable and customary use in describing the
or as an addendum to the NOTICE text from the Work, provided origin of the Work and reproducing the content of the NOTICE file.
that such additional attribution notices cannot be construed 7. Disclaimer of Warranty. Unless required by applicable law or
as modifying the License. agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
You may add Your own copyright statement to Your modifications and WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
may provide additional or different license terms and conditions implied, including, without limitation, any warranties or conditions
for use, reproduction, or distribution of Your modifications, or of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
for any such Derivative Works as a whole, provided Your use, PARTICULAR PURPOSE. You are solely responsible for determining the
reproduction, and distribution of the Work otherwise complies with appropriateness of using or redistributing the Work and assume any
the conditions stated in this License. risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
5. Submission of Contributions. Unless You explicitly state otherwise, whether in tort (including negligence), contract, or otherwise,
any Contribution intentionally submitted for inclusion in the Work unless required by applicable law (such as deliberate and grossly
by You to the Licensor shall be under the terms and conditions of negligent acts) or agreed to in writing, shall any Contributor be
this License, without any additional terms or conditions. liable to You for damages, including any direct, indirect, special,
Notwithstanding the above, nothing herein shall supersede or modify incidental, or consequential damages of any character arising as a
the terms of any separate license agreement you may have executed result of this License or out of the use or inability to use the
with Licensor regarding such Contributions. Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
6. Trademarks. This License does not grant permission to use the trade other commercial damages or losses), even if such Contributor
names, trademarks, service marks, or product names of the Licensor, has been advised of the possibility of such damages.
except as required for reasonable and customary use in describing the 9. Accepting Warranty or Additional Liability. While redistributing
origin of the Work and reproducing the content of the NOTICE file. the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
7. Disclaimer of Warranty. Unless required by applicable law or or other liability obligations and/or rights consistent with this
agreed to in writing, Licensor provides the Work (and each License. However, in accepting such obligations, You may act only
Contributor provides its Contributions) on an "AS IS" BASIS, on Your own behalf and on Your sole responsibility, not on behalf
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or of any other Contributor, and only if You agree to indemnify,
implied, including, without limitation, any warranties or conditions defend, and hold each Contributor harmless for any liability
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A incurred by, or claims asserted against, such Contributor by reason
PARTICULAR PURPOSE. You are solely responsible for determining the of your accepting any such warranty or additional liability.
appropriateness of using or redistributing the Work and assume any END OF TERMS AND CONDITIONS
risks associated with Your exercise of permissions under this License. APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
8. Limitation of Liability. In no event and under no legal theory, boilerplate notice, with the fields enclosed by brackets "{}"
whether in tort (including negligence), contract, or otherwise, replaced with your own identifying information. (Don't include
unless required by applicable law (such as deliberate and grossly the brackets!) The text should be enclosed in the appropriate
negligent acts) or agreed to in writing, shall any Contributor be comment syntax for the file format. We also recommend that a
liable to You for damages, including any direct, indirect, special, file or class name and description of purpose be included on the
incidental, or consequential damages of any character arising as a same "printed page" as the copyright notice for easier
result of this License or out of the use or inability to use the identification within third-party archives.
Work (including but not limited to damages for loss of goodwill, Copyright {yyyy} {name of copyright owner}
work stoppage, computer failure or malfunction, or any and all Licensed under the Apache License, Version 2.0 (the "License");
other commercial damages or losses), even if such Contributor you may not use this file except in compliance with the License.
has been advised of the possibility of such damages. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
9. Accepting Warranty or Additional Liability. While redistributing Unless required by applicable law or agreed to in writing, software
the Work or Derivative Works thereof, You may choose to offer, distributed under the License is distributed on an "AS IS" BASIS,
and charge a fee for, acceptance of support, warranty, indemnity, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
or other liability obligations and/or rights consistent with this See the License for the specific language governing permissions and
License. However, in accepting such obligations, You may act only limitations under the License.
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

+ 15
- 15
NOTICE View File

@ -1,15 +1,15 @@
Copyright 2015 Kamo Petrosyan Copyright 2015 Kamo Petrosyan
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License. You may obtain a copy of the License at
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
http://www.apache.org/licenses/LICENSE-2.0 distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Unless required by applicable law or agreed to in writing, software See the License for the specific language governing permissions and
distributed under the License is distributed on an "AS IS" BASIS, limitations under the License.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. https://github.com/Haikson/pysitemap
See the License for the specific language governing permissions and
limitations under the License.
https://github.com/Haikson/pysitemap

+ 38
- 38
README.md View File

@ -1,38 +1,38 @@
# pysitemap # pysitemap
Sitemap generator Sitemap generator
## installing
## installing pip install sitemap-generator
## Gevent
pip install sitemap-generator Sitemap-generator uses [gevent](http://www.gevent.org/) to implement multiprocessing. Install gevent:
pip install gevent
## Gevent ## example
import pysitemap
Sitemap-generator uses [gevent](http://www.gevent.org/) to implement multiprocessing. Install gevent: if __name__ == '__main__':
url = 'http://www.example.com/' # URL to start crawling from logfile = 'errlog.log' # path to logfile
pip install gevent logfile = 'errlog.log' # path to logfile
oformat = 'xml' # output format
## example crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
crawl.crawl()
import pysitemap ## multiprocessing example
import pysitemap
if __name__ == '__main__':
if __name__ == '__main__': url = 'http://www.example.com/' # URL to start crawling from
url = 'http://www.example.com/' # URL to start crawling from logfile = 'errlog.log' # path to logfile
logfile = 'errlog.log' # path to logfile oformat = 'xml' # output format
oformat = 'xml' # output format crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat) crawl.crawl(pool_size=10) # 10 parsing processes
crawl.crawl()
## multiprocessing example
import pysitemap
if __name__ == '__main__':
url = 'http://www.example.com/' # URL to start crawling from
logfile = 'errlog.log' # path to logfile
oformat = 'xml' # output format
crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat)
crawl.crawl(pool_size=10) # 10 parsing processes

+ 20
- 20
composer.json View File

@ -1,20 +1,20 @@
{ {
"name": "haikson/sitemap-generator", "name": "haikson/sitemap-generator",
"type": "library", "type": "library",
"description": "Sitemap crawler and generator class", "description": "Sitemap crawler and generator class",
"keywords": ["sitemap","crawler"], "keywords": ["sitemap","crawler"],
"homepage": "https://github.com/Haikson/sitemap-generator", "homepage": "https://github.com/Haikson/sitemap-generator",
"license": "Apache License", "license": "Apache License",
"authors": [ "authors": [
{ {
"name": "Kamo Petrosyan", "name": "Kamo Petrosyan",
"email": "kamo@haikson.com", "email": "kamo@haikson.com",
"homepage": "http://www.haikson.com", "homepage": "http://www.haikson.com",
"role": "Developer" "role": "Developer"
} }
], ],
"require": { "require": {
"beautifulsoup4": ">=4.4.0", "beautifulsoup4": ">=4.4.0",
"mechanize": ">=0.2.5" "mechanize": ">=0.2.5"
} }
} }

+ 153
- 153
pysitemap/crawler.py View File

@ -1,153 +1,153 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import __future__ import __future__
import sys import sys
import urlparse import urlparse
import requests import requests
from lxml import html from lxml import html
import re import re
import time import time
try: try:
import sys import sys
if 'threading' in sys.modules: if 'threading' in sys.modules:
del sys.modules['threading'] del sys.modules['threading']
print('threading module loaded before patching!') print('threading module loaded before patching!')
print('threading module deleted from sys.modules!\n') print('threading module deleted from sys.modules!\n')
from gevent import monkey, pool from gevent import monkey, pool
monkey.patch_all() monkey.patch_all()
gevent_installed = True gevent_installed = True
except: except:
print("Gevent does not installed. Parsing process will be slower.") print("Gevent does not installed. Parsing process will be slower.")
gevent_installed = False gevent_installed = False
class Crawler:
def __init__(self, url, outputfile='sitemap.xml', logfile='error.log', oformat='xml'):
class Crawler: self.url = url
def __init__(self, url, outputfile='sitemap.xml', logfile='error.log', oformat='xml'): self.logfile = open(logfile, 'a')
self.url = url self.oformat = oformat
self.logfile = open(logfile, 'a') self.outputfile = outputfile
self.oformat = oformat # create sets for the queued urls and the visited urls
self.outputfile = outputfile self.urls = set([url])
self.visited = set([url])
# create sets for the queued urls and the visited urls self.exts = ['htm', 'php']
self.urls = set([url]) self.allowed_regex = '\.((?!htm)(?!php)\w+)$'
self.visited = set([url]) self.errors = {'404': []}
self.exts = ['htm', 'php'] def set_exts(self, exts):
self.allowed_regex = '\.((?!htm)(?!php)\w+)$' self.exts = exts
self.errors = {'404': []} def allow_regex(self, regex=None):
if regex is not None:
def set_exts(self, exts): self.allowed_regex = regex
self.exts = exts else:
allowed_regex = ''
def allow_regex(self, regex=None): for ext in self.exts:
if regex is not None: allowed_regex += '(!{})'.format(ext)
self.allowed_regex = regex self.allowed_regex = '\.({}\w+)$'.format(allowed_regex)
else: def crawl(self, echo=False, pool_size=1):
allowed_regex = '' # sys.stdout.write('echo attribute deprecated and will be removed in future')
for ext in self.exts: self.echo = echo
allowed_regex += '(!{})'.format(ext) self.regex = re.compile(self.allowed_regex)
self.allowed_regex = '\.({}\w+)$'.format(allowed_regex) print('Parsing pages')
if gevent_installed and pool_size >= 1:
def crawl(self, echo=False, pool_size=1): self.pool = pool.Pool(pool_size)
# sys.stdout.write('echo attribute deprecated and will be removed in future') self.pool.spawn(self.parse_gevent)
self.echo = echo self.pool.join()
self.regex = re.compile(self.allowed_regex) else:
while len(self.urls) > 0:
print('Parsing pages') self.parse()
if gevent_installed and pool_size >= 1: if self.oformat == 'xml':
self.pool = pool.Pool(pool_size) self.write_xml()
self.pool.spawn(self.parse_gevent) elif self.oformat == 'txt':
self.pool.join() self.write_txt()
else: with open('errors.txt', 'w') as err_file:
while len(self.urls) > 0: for key, val in self.errors.items():
self.parse() err_file.write(u'\n\nError {}\n\n'.format(key))
if self.oformat == 'xml': err_file.write(u'\n'.join(set(val)))
self.write_xml() def parse_gevent(self):
elif self.oformat == 'txt': self.parse()
self.write_txt() while len(self.urls) > 0 and not self.pool.full():
with open('errors.txt', 'w') as err_file: self.pool.spawn(self.parse_gevent)
for key, val in self.errors.items(): def parse(self):
err_file.write(u'\n\nError {}\n\n'.format(key)) if self.echo:
err_file.write(u'\n'.join(set(val))) n_visited, n_urls, n_pool = len(self.visited), len(self.urls), len(self.pool)
status = (
def parse_gevent(self): '{} pages parsed :: {} pages in the queue'.format(n_visited, n_urls),
self.parse() '{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(n_visited, n_pool, n_urls)
while len(self.urls) > 0 and not self.pool.full(): )
self.pool.spawn(self.parse_gevent) print(status[int(gevent_installed)])
if not self.urls:
def parse(self): return
if self.echo: else:
n_visited, n_urls, n_pool = len(self.visited), len(self.urls), len(self.pool) url = self.urls.pop()
status = ( try:
'{} pages parsed :: {} pages in the queue'.format(n_visited, n_urls), response = requests.get(url)
'{} pages parsed :: {} parsing processes :: {} pages in the queue'.format(n_visited, n_pool, n_urls) # if status code is not 200, then add url to the self.errors dictionary
) if response.status_code != 200:
print(status[int(gevent_installed)]) if self.errors.get(str(response.status_code), False):
self.errors[str(response.status_code)].extend([url])
if not self.urls: else:
return self.errors.update({str(response.status_code): [url]})
else: self.errlog("Error {} at url {}".format(response.status_code, url))
url = self.urls.pop() return
try: tree = html.fromstring(response.text)
response = requests.get(url) for link_tag in tree.findall('.//a'):
# if status code is not 200, then add url to the self.errors dictionary link = link_tag.attrib.get('href', '')
if response.status_code != 200: newurl = urlparse.urljoin(self.url, link)
if self.errors.get(str(response.status_code), False): # print(newurl)
self.errors[str(response.status_code)].extend([url]) if self.is_valid(newurl):
else: self.visited.update([newurl])
self.errors.update({str(response.status_code): [url]}) self.urls.update([newurl])
self.errlog("Error {} at url {}".format(response.status_code, url)) except Exception, e:
return self.errlog(e.message)
def is_valid(self, url):
tree = html.fromstring(response.text) if '#' in url:
for link_tag in tree.findall('.//a'): url = url[:url.find('#')]
link = link_tag.attrib.get('href', '') if url in self.visited:
newurl = urlparse.urljoin(self.url, link) return False
# print(newurl) if self.url not in url:
if self.is_valid(newurl): return False
self.visited.update([newurl]) if re.search(self.regex, url):
self.urls.update([newurl]) return False
except Exception, e: return True
self.errlog(e.message) def errlog(self, msg):
self.logfile.write(msg)
def is_valid(self, url): self.logfile.write('\n')
if '#' in url: def write_xml(self):
url = url[:url.find('#')] of = open(self.outputfile, 'w')
if url in self.visited: of.write('<?xml version="1.0" encoding="utf-8"?>\n')
return False of.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n')
if self.url not in url: url_str = '<url><loc>{}</loc></url>\n'
return False while self.visited:
if re.search(self.regex, url): of.write(url_str.format(self.visited.pop()))
return False of.write('</urlset>')
return True of.close()
def write_txt(self):
def errlog(self, msg): of = open(self.outputfile, 'w')
self.logfile.write(msg) url_str = '{}\n'
self.logfile.write('\n') while self.visited:
of.write(url_str.format(self.visited.pop()))
def write_xml(self): of.close()
of = open(self.outputfile, 'w') def show_progress(self, count, total, status=''):
of.write('<?xml version="1.0" encoding="utf-8"?>\n') bar_len = 60
of.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">\n') filled_len = int(round(bar_len * count / float(total)))
url_str = '<url><loc>{}</loc></url>\n' percents = round(100.0 * count / float(total), 1)
while self.visited: bar = '=' * filled_len + '-' * (bar_len - filled_len)
of.write(url_str.format(self.visited.pop())) sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
sys.stdout.flush() # As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113)
of.write('</urlset>') time.sleep(0.5)
of.close()
def write_txt(self):
of = open(self.outputfile, 'w')
url_str = '{}\n'
while self.visited:
of.write(url_str.format(self.visited.pop()))
of.close()
def show_progress(self, count, total, status=''):
bar_len = 60
filled_len = int(round(bar_len * count / float(total)))
percents = round(100.0 * count / float(total), 1)
bar = '=' * filled_len + '-' * (bar_len - filled_len)
sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
sys.stdout.flush() # As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113)
time.sleep(0.5)

+ 2
- 2
requirements.txt View File

@ -1,2 +1,2 @@
lxml lxml
requests requests

+ 16
- 16
run.py View File

@ -1,16 +1,16 @@
import pysitemap import pysitemap
"""
""" Example script
Example script Uses gevent to implement multiprocessing if Gevent installed
Uses gevent to implement multiprocessing if Gevent installed To install gevent:
To install gevent: $ pip install gevent
$ pip install gevent """
""" if __name__ == '__main__':
url = 'http://www.example.com/' # url from to crawl
if __name__ == '__main__': logfile = 'errlog.log' # path to logfile
url = 'http://www.example.com/' # url from to crawl oformat = 'xml' # output format
logfile = 'errlog.log' # path to logfile outputfile = 'sitemap.xml' # path to output file
oformat = 'xml' # output format crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
outputfile = 'sitemap.xml' # path to output file crawl.crawl(pool_size=20)
crawl = pysitemap.Crawler(url=url, logfile=logfile, oformat=oformat, outputfile=outputfile)
crawl.crawl(pool_size=20)

+ 36
- 36
setup.py View File

@ -1,37 +1,37 @@
from distutils.core import setup from distutils.core import setup
from setuptools import find_packages, setup from setuptools import find_packages, setup
EXCLUDE_FROM_PACKAGES = ['tests',]
EXCLUDE_FROM_PACKAGES = ['tests',] def get_version(major=0, minor=0, build=0):
return '%s.%s.%s' % (major, minor, build)
setup(
def get_version(major=0, minor=0, build=0): name='sitemap-generator',
return '%s.%s.%s' % (major, minor, build) version=get_version(
major=0,
minor=4,
setup( build=4,
name='sitemap-generator', ),
version=get_version( packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES),
major=0, include_package_data=True,
minor=4, url='https://github.com/Haikson/sitemap-generator',
build=4, license='GPL3',
), author='Kamo Petrosyan',
packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES), author_email='kamo@haikson.com',
include_package_data=True, description='web crawler and sitemap generator.',
url='https://github.com/Haikson/sitemap-generator', classifiers=[
license='GPL3', 'Environment :: Web Environment',
author='Kamo Petrosyan', 'Intended Audience :: Developers',
author_email='kamo@haikson.com', 'License :: OSI Approved :: BSD License',
description='web crawler and sitemap generator.', 'Operating System :: OS Independent',
classifiers=[ 'Programming Language :: Python',
'Environment :: Web Environment', 'Programming Language :: Python :: 2',
'Intended Audience :: Developers', 'Programming Language :: Python :: 2.7',
'License :: OSI Approved :: BSD License', 'Topic :: Software Development :: Libraries :: Python Modules',
'Operating System :: OS Independent', ],
'Programming Language :: Python', install_requires=['lxml', 'requests'],
'Programming Language :: Python :: 2', requires=['lxml', 'requests']
'Programming Language :: Python :: 2.7',
'Topic :: Software Development :: Libraries :: Python Modules',
],
install_requires=['lxml', 'requests'],
requires=['lxml', 'requests']
) )

+ 1107
- 1107
sitemap.xml
File diff suppressed because it is too large
View File


|||||||
|||||||
xxxxxxxxxx
 
000:0
x
 
000:0
Loading…
Cancel
Save