Code clean-up: remove redundant whitespaces

branch: master
Pekka Helenius, 3 years ago
parent commit: 3df3cb660d

1 changed file with 107 additions and 108 deletions:

  code/url-analyzer.py (+107 −108)

@@ -96,21 +96,21 @@ class json_url_data(object):
    Returns a requests.models.Response object.
    """
    def set_session(self, url, method='get', redirects=True):

        # HTTP response status codes 1XX, 2XX and 3XX are OK
        # Treat other codes as errors
        sc = re.compile(r"^[123]{1}[0-9]{2}")

        sleep(sleep_interval_between_requests)

        try:
            session = requests.Session()
            response = session.request(method, url, headers=request_headers, allow_redirects=redirects)
            if not sc.match(str(response.status_code)):
                raise Exception("Error: got invalid response status from the web server")
            return response
        except:
            raise Exception("Error: HTTP session could not be established. URL: '" + url + "' (method: " + method + ")") from None
@@ -121,7 +121,7 @@ class json_url_data(object):
    Returns a bs4.BeautifulSoup object.
    """
    def get_html_data(self, url):

        try:
            data = bs(self.set_session(url).content, 'html.parser')
            return data
@@ -135,15 +135,15 @@ class json_url_data(object):
    Returns a list object.
    """
    def get_url_redirects(self, url):

        response = self.set_session(url)
        list_data = []

        if response.history:
            for r in response.history:
                list_data.append({'redirect_url': r.url, 'status': r.status_code})

        return list_data

    ######################################
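Note: requests records intermediate hops in response.history, so for a redirecting URL the method returns one dict per hop. An illustrative sketch (the URL and status codes are made up):

    # redirects = json_url_data().get_url_redirects('http://example.com/old')
    # -> [{'redirect_url': 'http://example.com/old', 'status': 301},
    #     {'redirect_url': 'https://example.com/old', 'status': 302}]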
@@ -153,9 +153,9 @@ class json_url_data(object):
    Returns a string object.
    """
    def get_webpage_title(self, url):

        html_data = self.get_html_data(url)
        title = html_data.title.string

        return title
@@ -175,7 +175,7 @@ class json_url_data(object):
    """
    def get_domain_name(self, url):

        domain_name = self.get_whois_data(url).domain_name

        if type(domain_name) is list:
            return domain_name[0].lower()
        else:
@@ -184,29 +184,29 @@ class json_url_data(object):
    ######################################
    """
    Get initial and final URLs

    Compare whether the final (destination) URL
    matches with the initial URL in a request.

    Returns a dict object.
    """
    def get_startfinal_urls(self, url):

        response = self.set_session(url)
        end_url = response.url

        start_match = False
        final_match = False

        # dr = re.compile(r"^([a-z]+://)?([^/]+)")
        # dr_group_lastindex = dr.match(url).lastindex
        # domain_name = dr.match(url).group(dr_group_lastindex)
        domain_name = self.get_domain_name(url)

        if re.search(domain_name, end_url):
            final_match = True

        dict_data = {
            'startfinal_urls': {
                'start_url': {
@@ -217,13 +217,13 @@ class json_url_data(object):
                }
            }
        }

        return dict_data

    ######################################
    """
    Get domain registrar

    Returns a dict object.
    """
    def get_domain_registrar(self, url):
@@ -235,21 +235,21 @@ class json_url_data(object):
    Do comparison between the domain name, extracted
    from WHOIS domain data and contents of a title HTML
    element, extracted from HTML data based on a given URL.

    Returns a dict object.
    """
    def get_domain_title_match(self, url):

        domain_name = self.get_domain_name(url)
        title = self.get_webpage_title(url)

        # If is string:
        if type(domain_name) is str:
            if re.search(domain_name, title, re.IGNORECASE):
                match = True
            else:
                match = False

        # If is list:
        elif type(domain_name) is list:
            for d in domain_name:
@@ -260,49 +260,49 @@ class json_url_data(object):
                    match = False
        else:
            match = False

        dict_data = {
            'webpage_title': title,
            'domain_in_webpage_title': match
        }

        return dict_data
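Note: the comparison above is a case-insensitive substring search for the WHOIS domain name inside the page title. A standalone check of the same logic, assuming get_domain_name() yields 'example.com' and the title is 'Example.com - Home':

    import re

    match = bool(re.search('example.com', 'Example.com - Home', re.IGNORECASE))
    # match == True, so the method would return
    # {'webpage_title': 'Example.com - Home', 'domain_in_webpage_title': True}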
    ######################################
    """
    Get a single timestamp from given data

    Two scenarios are considered: dates argument is either
    a list or a string. If it is a list, then we need
    to decide which date value to extract.

    Returns a date object.
    """
    def get_single_date(self, dates, newest=False):

        dates_epoch = []

        if type(dates) is list:
            for d in dates:
                dates_epoch.append(d.timestamp())
        else:
            dates_epoch.append(dates.timestamp())

        return datetime.fromtimestamp(sorted(dates_epoch, reverse=newest)[0])
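Note: because sorted(..., reverse=newest) sorts epoch values ascending by default, index [0] selects the oldest date unless newest=True. A standalone check of the same logic with made-up dates:

    from datetime import datetime

    dates = [datetime(2020, 5, 1), datetime(2018, 1, 1), datetime(2019, 3, 2)]
    epochs = [d.timestamp() for d in dates]
    print(datetime.fromtimestamp(sorted(epochs)[0]))                # 2018-01-01 (newest=False)
    print(datetime.fromtimestamp(sorted(epochs, reverse=True)[0]))  # 2020-05-01 (newest=True)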
    ######################################
    """
    Get domain time information based on WHOIS domain data.

    Returns a dict object.
    """
    def get_domain_timeinfo(self, url):

        whois_data = self.get_whois_data(url)
        domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False)
        domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False)
        domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False)

        dict_data = {
            'domain_timestamps':
            {
@@ -311,25 +311,25 @@ class json_url_data(object):
                'expires': domain_expiration_date.strftime(dateformat)
            }
        }

        return dict_data

    ######################################
    """
    Get domain time information based on WHOIS domain data,
    relative to the current date (UTC time).

    Returns a dict object.
    """
    def get_domain_timeinfo_relative(self, url):

        date_now = datetime.utcnow()
        whois_data = self.get_whois_data(url)
        domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False)
        domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False)
        domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False)

        dict_data = {
            'domain_timestamps_relative':
            {
@@ -339,7 +339,7 @@ class json_url_data(object):
                'expires_days_left': (domain_expiration_date - date_now).days
            }
        }

        return dict_data

    ######################################
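Note: the relative fields are plain day counts computed via timedelta.days against the current UTC time. A worked example with assumed dates:

    from datetime import datetime

    date_now = datetime(2021, 6, 1)
    domain_expiration_date = datetime(2022, 6, 1)
    print((domain_expiration_date - date_now).days)  # 365 -> reported as 'expires_days_left'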
@@ -348,15 +348,15 @@ class json_url_data(object):
    '../foo/bar/',
    '/foo/../../bar/',
    'https://foo.bar/foo/../'
    etc.

    Returns a boolean object.
    """
    def is_multidot_url(self, url):

        multidot = re.compile(r".*[.]{2}/.*")

        if multidot.match(url):
            return True
        return False
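Note: the pattern only requires a single '../' sequence somewhere in the URL. A quick standalone check against the docstring examples:

    import re

    multidot = re.compile(r".*[.]{2}/.*")
    for u in ('../foo/bar/', '/foo/../../bar/', 'https://foo.bar/foo/../', 'https://foo.bar/'):
        print(u, bool(multidot.match(u)))
    # the first three print True; the plain URL prints False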
@@ -364,13 +364,13 @@ class json_url_data(object):
    ######################################
    """
    Get HTML element data from HTML data contents.

    Two fetching methods are supported:

    - A) use only HTML element/tag name and extract raw contents of
      these tags
    - B) use both HTML element/tag name and more fine-grained
      inner attribute name to determine which HTML elements are extracted

    Special case - URL link references:

    - attributes 'href' or 'src' are considered as link referrals and
      they are handled in a special way
@@ -378,55 +378,55 @@ class json_url_data(object):
      (patterns: '/', '#', '../' and '/<anything>')
    - B) link referrals to external domains are placed in 'ext_refs' list
      (patterns such as 'https://foo.bar.dot/fancysite' etc.)

    - Both A) and B) link categories have 'normal' and 'multidot' subcategories
      - normal links do not contain pattern '../'
      - multidot links contain '../' pattern

    Returns a dict object.
    """
    def get_tag_data(self, url, tag, attribute=None):

        html_data = self.get_html_data(url)
        domain_name = self.get_domain_name(url)
        data = []

        if attribute != None:

            for d in html_data.find_all(tag):
                # Ignore the HTML tag if it does not contain our attribute
                if d.get(attribute) != None:
                    data.append(d.get(attribute))

            if attribute == 'href' or attribute == 'src':

                self_refs = { 'normal': [], 'multidot': [] }
                ext_refs = { 'normal': [], 'multidot': [] }

                # Syntax: '#<anything>', '/<anything>', '../<anything>'
                rs = re.compile(r"^[/#]|^[.]{2}/.*")

                # Syntax: '<text>:<text>/'
                rd = re.compile(r"^[a-z]+:[a-z]+/")

                # Syntax examples:
                # 'http://foo.bar/', 'https://foo.bar/', 'foo.bar/', 'https://virus.foo.bar/'
                rl = re.compile(r"^([a-z]+://)?([^/]*" + domain_name + "/)")

                for s in data:
                    # Ignore mailto links
                    if re.match("^mailto:", s): continue

                    if rs.match(s) or rl.match(s) or rd.match(s):
                        if self.is_multidot_url(s):
                            self_refs['multidot'].append(s)
                        else:
                            self_refs['normal'].append(s)
                    else:
                        if self.is_multidot_url(s):
                            try:
                                ext_refs['multidot'].append({'url': s, 'registrar': self.get_whois_data(s).registrar })
@@ -440,31 +440,31 @@ class json_url_data(object):
                            except:
                                ext_refs['normal'].append({'url': s, 'registrar': None })
                                pass

                data = None

                dict_data = {
                    tag: {
                        attribute + '_ext': (ext_refs),
                        attribute + '_self': (self_refs)
                    }
                }

            else:
                dict_data = {
                    tag: {
                        attribute: (data)
                    }
                }

        else:
            for d in html_data.find_all(tag):
                data.append(d.prettify())

            dict_data = {
                tag: (data)
            }

        return dict_data

    ######################################
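Note: for 'href'/'src' attributes the returned dict nests external and self references, each with 'normal'/'multidot' buckets. An illustrative shape for get_tag_data(url, 'a', 'href'), with made-up values:

    # {'a': {
    #     'href_ext':  {'normal': [{'url': 'https://other.site/', 'registrar': 'Some Registrar'}],
    #                   'multidot': []},
    #     'href_self': {'normal': ['/about', '#top'], 'multidot': ['../index.html']}
    # }}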
@@ -473,21 +473,21 @@ class json_url_data(object):
    the webpage itself?
    """
    def get_registrar_count(self, registrar, urls):

        i = 0

        for u in urls:
            for k,v in u.items():
                if k == 'registrar' and v == registrar:
                    i += 1

        o = len(urls) - i

        dict_data = {
            'same_registrar_count': i,
            'other_registrar_count': o
        }

        return dict_data

    ######################################
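Note: the method simply counts how many fetched external URLs share the registrar of the analyzed domain; everything else counts as "other". A standalone sketch with fabricated input:

    urls = [{'url': 'https://a.example/', 'registrar': 'Foo Registrar'},
            {'url': 'https://b.example/', 'registrar': 'Bar Registrar'}]
    # get_registrar_count('Foo Registrar', urls)
    # -> {'same_registrar_count': 1, 'other_registrar_count': 1}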
@@ -495,9 +495,9 @@ class json_url_data(object):
    """
    Get values existing in a dict object,
    based on a known key string.

    Returns a list object.

    TODO: Major re-work for the fetch function
    TODO: Support for more sophisticated JSON key string filtering
@@ -563,7 +563,7 @@ class json_url_data(object):
        if flatten == True:
            for u in get_data_extract(data_extract):
                flat_data.append(u)

            return flat_data
        else:
            return data_extract
@@ -573,21 +573,21 @@ class json_url_data(object):
    Compile URL related data.
    """
    def get_url_data(self, url):

        # Dict object for simple, non-nested data
        data_simple = {}

        # Pre-defined dict object for specific data sets
        webpage_data = {}

        startfinal_url = self.get_startfinal_urls(url)
        redirect_url = self.get_url_redirects(url)
        domain_registrar = self.get_domain_registrar(url)
        domaintitle_match = self.get_domain_title_match(url)

        domain_time_relative = self.get_domain_timeinfo_relative(url)
        domain_time = self.get_domain_timeinfo(url)

        html_element_iframe = self.get_tag_data(url, 'iframe')
        html_element_a_href = self.get_tag_data(url, 'a', link_refs['a'])
        html_element_img_src = self.get_tag_data(url, 'img', link_refs['img'])
@@ -597,53 +597,53 @@ class json_url_data(object):
            'iframes_count':
                len(self.json_fetcher(html_element_iframe, 'iframe').get_data())
        }

        multidot_urls_count = {
            'multidot_url_count':
                len(self.json_fetcher(html_element_a_href, 'multidot').get_data()) +
                len(self.json_fetcher(html_element_img_src, 'multidot').get_data()) +
                len(self.json_fetcher(html_element_script_src, 'multidot').get_data())
        }

        ###################
        def get_total_registrars():

            same_registrar_counts = 0
            other_registrar_counts = 0

            for k,v in link_refs.items():
                html_element = self.get_tag_data(url, k, v)

                same_registrar_counts += self.get_registrar_count(
                    domain_registrar['domain_registrar'],
                    html_element[k][v + '_ext']['normal']
                )['same_registrar_count']

                other_registrar_counts += self.get_registrar_count(
                    domain_registrar['domain_registrar'],
                    html_element[k][v + '_ext']['normal']
                )['other_registrar_count']

            registrar_counts = {
                'same_registrar_count': same_registrar_counts,
                'other_registrar_count': other_registrar_counts
            }

            return registrar_counts

        # Avoid unnecessary nesting of the following data
        data_simple.update(domain_registrar)
        data_simple.update(domaintitle_match)
        data_simple.update(iframes_count)
        data_simple.update(multidot_urls_count)
        data_simple.update(get_total_registrars())

        url_data = dict({
            url: [
                data_simple,
                startfinal_url,
                {'redirects': redirect_url},

                domain_time_relative,
                domain_time,

                {'webpage_data': [
                    html_element_iframe,
                    html_element_a_href,
@@ -653,7 +653,7 @@ class json_url_data(object):
                }
            ]
        })

        return url_data
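Note: get_url_data() is the aggregation entry point; the result is a dict keyed by the analyzed URL, intended to be JSON-serializable. A minimal usage sketch, assuming a no-argument constructor:

    import json

    data = json_url_data().get_url_data('https://example.com/')
    print(json.dumps(data, indent=2))  # single top-level key: the URL itself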
@@ -667,11 +667,11 @@ class write_operations(object):
    """
    Set JSON file name, append number suffix
    if file exists already.

    Returns file name path.
    """
    def set_filename(self):

        c = 0
        while True:
            if os.path.exists(self.filename):
@@ -689,7 +689,7 @@ class write_operations(object):
    Append to a JSON file.
    """
    def write_to_file(self, data):

        try:
            json_file = open(self.filename, "a")
            json_file.write(data)
@@ -762,7 +762,7 @@ class data_visualization(object):
            'local_urls': link_count(unique_refs, '_self'),
            'external_urls': link_count(unique_refs, '_ext')
        }

        return data

    def get_registrars(self):
@@ -778,7 +778,7 @@ class data_visualization(object):
        return registrars

    def get_registrar_count_summary(self):

        domain_counter = dict(Counter(self.get_registrars()))
        data = {'fetched_domains': domain_counter, 'url_domain_registrar': self.domain_registrar }

        return data
@@ -807,7 +807,7 @@ if __name__ == '__main__':
    # Get URLs from an available JSON data
    for key_url in json_data.keys():

        print("Generating statistics: %s" % key_url)

        fig = plt.figure()
@@ -817,19 +817,19 @@ if __name__ == '__main__':
            # 'figure.constrained_layout.use': True
        }
        plt.rcParams.update(fig_params)

        domain_string = url_str_pattern.split(key_url)[2].replace('.','')

        summary = data_visualization(key_url, json_data)
        summary_registrars = summary.get_registrar_count_summary()['fetched_domains']

        x_r = list(summary_registrars.keys())
        y_r = list(summary_registrars.values())

        # Show bar values
        for index,data in enumerate(y_r):
            plt.text(x=index, y=data+0.5, s=data, fontdict=dict(fontsize=8))

        title_r = "Domains associated with HTML URL data (" + key_url + ")"
        xlabel_r = "Fetched domains"
        ylabel_r = "Domain count"
@@ -845,18 +845,17 @@ if __name__ == '__main__':
        plt.show()

        #fig_u = plt.figure()
        #summary_urls = summary.get_urls_count_summary()

        #x_u = list(summary_urls.keys())
        #y_u = list(summary_urls.values())

        #title_u = "Local and external URL references (" + key_url + ")"
        #xlabel_u = "Fetched URLs"
        #ylabel_u = "URL count"

        #plt.bar(x_u, y_u, color="blue", edgecolor='black')
        #plt.title(title_u)
        #plt.xlabel(xlabel_u)
        #plt.ylabel(ylabel_u)

        #plt.show()
