Browse Source

Code clean-up: remove redundant whitespaces

master
Pekka Helenius 3 years ago
parent
commit
3df3cb660d
1 changed file with 107 additions and 108 deletions
  1. +107
    -108
      code/url-analyzer.py

+ 107
- 108
code/url-analyzer.py View File

@ -96,21 +96,21 @@ class json_url_data(object):
Returns a requests.models.Response object. Returns a requests.models.Response object.
""" """
def set_session(self, url, method='get', redirects=True):
    """
    Send one HTTP request and return the server response.

    Parameters:
        url:        target URL (string).
        method:     HTTP method name passed to requests (default 'get').
        redirects:  whether redirects are followed (default True).

    Returns a requests.models.Response object.

    Raises Exception when the request cannot be carried out, or when
    the server answers with a status code outside 1XX-3XX.
    """
    # Throttle consecutive requests.
    sleep(sleep_interval_between_requests)

    try:
        # The context manager releases the pooled connections once the
        # response body has been read.
        with requests.Session() as session:
            response = session.request(method, url, headers=request_headers, allow_redirects=redirects)
    except requests.exceptions.RequestException:
        # Only transport-level failures are wrapped here, so Ctrl-C and
        # the status error below are no longer masked by this message.
        raise Exception("Error: HTTP session could not be established. URL: '" + url + "' (method: " + method + ")") from None

    # HTTP response status codes 1XX, 2XX and 3XX are OK
    # Treat other codes as errors
    if not 100 <= response.status_code < 400:
        raise Exception("Error: got invalid response status from the web server")

    return response
@ -121,7 +121,7 @@ class json_url_data(object):
Returns a bs4.BeautifulSoup object. Returns a bs4.BeautifulSoup object.
""" """
def get_html_data(self, url): def get_html_data(self, url):
try: try:
data = bs(self.set_session(url).content, 'html.parser') data = bs(self.set_session(url).content, 'html.parser')
return data return data
@ -135,15 +135,15 @@ class json_url_data(object):
Returns a list object. Returns a list object.
""" """
def get_url_redirects(self, url):
    """
    Collect the redirect hops taken while requesting the URL.

    Returns a list of dicts, one per intermediate response, each with
    the keys 'redirect_url' and 'status'. The list is empty when no
    redirect took place.
    """
    response = self.set_session(url)

    # 'history' holds the intermediate responses, oldest first.
    return [
        {'redirect_url': hop.url, 'status': hop.status_code}
        for hop in (response.history or [])
    ]
###################################### ######################################
@ -153,9 +153,9 @@ class json_url_data(object):
Returns a string object. Returns a string object.
""" """
def get_webpage_title(self, url):
    """
    Extract the text of the <title> element of the page behind the URL.

    Returns a string object. An empty string is returned when the page
    has no <title> element or when the element has no single text
    child, so callers can always treat the result as text.
    """
    html_data = self.get_html_data(url)
    title_tag = html_data.title

    # Both the tag and its '.string' may be None.
    if title_tag is None or title_tag.string is None:
        return ""

    return title_tag.string
@ -175,7 +175,7 @@ class json_url_data(object):
""" """
def get_domain_name(self, url): def get_domain_name(self, url):
domain_name = self.get_whois_data(url).domain_name domain_name = self.get_whois_data(url).domain_name
if type(domain_name) is list: if type(domain_name) is list:
return domain_name[0].lower() return domain_name[0].lower()
else: else:
@ -184,29 +184,29 @@ class json_url_data(object):
###################################### ######################################
""" """
Get initial and final URLs Get initial and final URLs
Compare whether the final (destination) URL Compare whether the final (destination) URL
matches with the initial URL in a request. matches with the initial URL in a request.
Returns a dict object. Returns a dict object.
""" """
def get_startfinal_urls(self, url): def get_startfinal_urls(self, url):
response = self.set_session(url) response = self.set_session(url)
end_url = response.url end_url = response.url
start_match = False start_match = False
final_match = False final_match = False
# dr = re.compile(r"^([a-z]+://)?([^/]+)") # dr = re.compile(r"^([a-z]+://)?([^/]+)")
# dr_group_lastindex = dr.match(url).lastindex # dr_group_lastindex = dr.match(url).lastindex
# domain_name = dr.match(url).group(dr_group_lastindex) # domain_name = dr.match(url).group(dr_group_lastindex)
domain_name = self.get_domain_name(url) domain_name = self.get_domain_name(url)
if re.search(domain_name, end_url): if re.search(domain_name, end_url):
final_match = True final_match = True
dict_data = { dict_data = {
'startfinal_urls': { 'startfinal_urls': {
'start_url': { 'start_url': {
@ -217,13 +217,13 @@ class json_url_data(object):
} }
} }
} }
return dict_data return dict_data
###################################### ######################################
""" """
Get domain registrar Get domain registrar
Returns a dict object. Returns a dict object.
""" """
def get_domain_registrar(self, url): def get_domain_registrar(self, url):
@ -235,21 +235,21 @@ class json_url_data(object):
Do comparison between the domain name, extracted Do comparison between the domain name, extracted
from WHOIS domain data and contents of a title HTML from WHOIS domain data and contents of a title HTML
element, extracted from HTML data based on a given URL. element, extracted from HTML data based on a given URL.
Returns a dict object. Returns a dict object.
""" """
def get_domain_title_match(self, url): def get_domain_title_match(self, url):
domain_name = self.get_domain_name(url) domain_name = self.get_domain_name(url)
title = self.get_webpage_title(url) title = self.get_webpage_title(url)
# If is string: # If is string:
if type(domain_name) is str: if type(domain_name) is str:
if re.search(domain_name, title, re.IGNORECASE): if re.search(domain_name, title, re.IGNORECASE):
match = True match = True
else: else:
match = False match = False
# If is list: # If is list:
elif type(domain_name) is list: elif type(domain_name) is list:
for d in domain_name: for d in domain_name:
@ -260,49 +260,49 @@ class json_url_data(object):
match = False match = False
else: else:
match = False match = False
dict_data = { dict_data = {
'webpage_title': title, 'webpage_title': title,
'domain_in_webpage_title': match 'domain_in_webpage_title': match
} }
return dict_data return dict_data
###################################### ######################################
""" """
Get a single timestamp from given data Get a single timestamp from given data
Two scenarios are considered: dates argument is either Two scenarios are considered: dates argument is either
a list or a string. If it is a list, then we need a list or a string. If it is a list, then we need
to decide which date value to extract. to decide which date value to extract.
Returns a date object. Returns a date object.
""" """
def get_single_date(self, dates, newest=False):
    """
    Pick a single timestamp from the given data.

    Parameters:
        dates:   one datetime object, or a list/tuple of them. None
                 entries are ignored.
        newest:  when True the latest date is picked, otherwise the
                 earliest one (default).

    Returns a datetime object built with datetime.fromtimestamp().

    Raises ValueError when no usable date value is given.
    """
    candidates = dates if isinstance(dates, (list, tuple)) else [dates]

    # Compare as epoch seconds, as before.
    dates_epoch = [d.timestamp() for d in candidates if d is not None]

    if not dates_epoch:
        raise ValueError("No date value available to select from")

    pick = max if newest else min
    return datetime.fromtimestamp(pick(dates_epoch))
###################################### ######################################
""" """
Get domain time information based on WHOIS domain data. Get domain time information based on WHOIS domain data.
Returns a dict object. Returns a dict object.
""" """
def get_domain_timeinfo(self, url): def get_domain_timeinfo(self, url):
whois_data = self.get_whois_data(url) whois_data = self.get_whois_data(url)
domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False) domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False)
domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False) domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False)
domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False) domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False)
dict_data = { dict_data = {
'domain_timestamps': 'domain_timestamps':
{ {
@ -311,25 +311,25 @@ class json_url_data(object):
'expires': domain_expiration_date.strftime(dateformat) 'expires': domain_expiration_date.strftime(dateformat)
} }
} }
return dict_data return dict_data
###################################### ######################################
""" """
Get domain time information based on WHOIS domain data, Get domain time information based on WHOIS domain data,
relative to the current date (UTC time). relative to the current date (UTC time).
Returns a dict object. Returns a dict object.
""" """
def get_domain_timeinfo_relative(self, url): def get_domain_timeinfo_relative(self, url):
date_now = datetime.utcnow() date_now = datetime.utcnow()
whois_data = self.get_whois_data(url) whois_data = self.get_whois_data(url)
domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False) domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False)
domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False) domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False)
domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False) domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False)
dict_data = { dict_data = {
'domain_timestamps_relative': 'domain_timestamps_relative':
{ {
@ -339,7 +339,7 @@ class json_url_data(object):
'expires_days_left': (domain_expiration_date - date_now).days 'expires_days_left': (domain_expiration_date - date_now).days
} }
} }
return dict_data return dict_data
###################################### ######################################
@ -348,15 +348,15 @@ class json_url_data(object):
'../foo/bar/' '../foo/bar/'
'/foo/../../bar/, '/foo/../../bar/,
'https://foo.bar/foo/../' 'https://foo.bar/foo/../'
etc. etc.
Returns a boolean object. Returns a boolean object.
""" """
def is_multidot_url(self, url):
    """
    Tell whether the URL contains a parent-directory step ('../').

    Returns a boolean object.
    """
    # A substring test finds '../' anywhere in the value, including
    # after a line break, which the former '.*' regex could not cross.
    return "../" in url
@ -364,13 +364,13 @@ class json_url_data(object):
###################################### ######################################
""" """
Get HTML element data from HTML data contents. Get HTML element data from HTML data contents.
Two fetching methods are supported: Two fetching methods are supported:
- A) use only HTML element/tag name and extract raw contents of - A) use only HTML element/tag name and extract raw contents of
these tags these tags
- B) use both HTML element/tag name and more fine-grained - B) use both HTML element/tag name and more fine-grained
inner attribute name to determine which HTML elements are extracted inner attribute name to determine which HTML elements are extracted
Special case - URL link references: Special case - URL link references:
- attributes 'href' or 'src' are considered as link referrals and - attributes 'href' or 'src' are considered as link referrals and
they are handled in a special way they are handled in a special way
@ -378,55 +378,55 @@ class json_url_data(object):
(patterns: '/', '#', '../' and '/<anything>') (patterns: '/', '#', '../' and '/<anything>')
- B) link referrals to external domains are placed in 'ext_refs' list - B) link referrals to external domains are placed in 'ext_refs' list
(patterns such as 'https://foo.bar.dot/fancysite' etc.) (patterns such as 'https://foo.bar.dot/fancysite' etc.)
- Both A) and B) link categories have 'normal' and 'multidot' subcategories - Both A) and B) link categories have 'normal' and 'multidot' subcategories
- normal links do not contain pattern '../' - normal links do not contain pattern '../'
- multidot links contain '../' pattern - multidot links contain '../' pattern
Returns a dict object. Returns a dict object.
""" """
def get_tag_data(self, url, tag, attribute=None): def get_tag_data(self, url, tag, attribute=None):
html_data = self.get_html_data(url) html_data = self.get_html_data(url)
domain_name = self.get_domain_name(url) domain_name = self.get_domain_name(url)
data = [] data = []
if attribute != None: if attribute != None:
for d in html_data.find_all(tag): for d in html_data.find_all(tag):
# Ignore the HTML tag if it does not contain our attribute # Ignore the HTML tag if it does not contain our attribute
if d.get(attribute) != None: if d.get(attribute) != None:
data.append(d.get(attribute)) data.append(d.get(attribute))
if attribute == 'href' or attribute == 'src': if attribute == 'href' or attribute == 'src':
self_refs = { 'normal': [], 'multidot': []} self_refs = { 'normal': [], 'multidot': []}
ext_refs = { 'normal': [], 'multidot': []} ext_refs = { 'normal': [], 'multidot': []}
# Syntax: '#<anything>', '/<anything>', '../<anything>' # Syntax: '#<anything>', '/<anything>', '../<anything>'
rs = re.compile(r"^[/#]|^[.]{2}/.*") rs = re.compile(r"^[/#]|^[.]{2}/.*")
# Syntax: '<text>:<text>/' # Syntax: '<text>:<text>/'
rd = re.compile(r"^[a-z]+:[a-z]+/") rd = re.compile(r"^[a-z]+:[a-z]+/")
# Syntax examples: # Syntax examples:
# 'http://foo.bar/', 'https://foo.bar/, 'foo.bar/', 'https://virus.foo.bar/' # 'http://foo.bar/', 'https://foo.bar/, 'foo.bar/', 'https://virus.foo.bar/'
rl = re.compile(r"^([a-z]+://)?([^/]*" + domain_name + "/)") rl = re.compile(r"^([a-z]+://)?([^/]*" + domain_name + "/)")
for s in data: for s in data:
# Ignore mailto links # Ignore mailto links
if re.match("^mailto:", s): continue if re.match("^mailto:", s): continue
if rs.match(s) or rl.match(s) or rd.match(s): if rs.match(s) or rl.match(s) or rd.match(s):
if self.is_multidot_url(s): if self.is_multidot_url(s):
self_refs['multidot'].append(s) self_refs['multidot'].append(s)
else: else:
self_refs['normal'].append(s) self_refs['normal'].append(s)
else: else:
if self.is_multidot_url(s): if self.is_multidot_url(s):
try: try:
ext_refs['multidot'].append({'url': s, 'registrar': self.get_whois_data(s).registrar }) ext_refs['multidot'].append({'url': s, 'registrar': self.get_whois_data(s).registrar })
@ -440,31 +440,31 @@ class json_url_data(object):
except: except:
ext_refs['normal'].append({'url': s, 'registrar': None }) ext_refs['normal'].append({'url': s, 'registrar': None })
pass pass
data = None data = None
dict_data = { dict_data = {
tag: { tag: {
attribute + '_ext': (ext_refs), attribute + '_ext': (ext_refs),
attribute + '_self': (self_refs) attribute + '_self': (self_refs)
} }
} }
else: else:
dict_data = { dict_data = {
tag: { tag: {
attribute: (data) attribute: (data)
} }
} }
else: else:
for d in html_data.find_all(tag): for d in html_data.find_all(tag):
data.append(d.prettify()) data.append(d.prettify())
dict_data = { dict_data = {
tag: (data) tag: (data)
} }
return dict_data return dict_data
###################################### ######################################
@ -473,21 +473,21 @@ class json_url_data(object):
the webpage itself? the webpage itself?
""" """
def get_registrar_count(self, registrar, urls):
    """
    Count how many URL entries share the given registrar.

    Parameters:
        registrar:  registrar value to compare against.
        urls:       list of dicts; the 'registrar' key of each entry is
                    compared. Entries without that key count as 'other'.

    Returns a dict object with the keys 'same_registrar_count' and
    'other_registrar_count'.
    """
    # Look the key up directly instead of scanning every item.
    same = sum(
        1 for entry in urls
        if 'registrar' in entry and entry['registrar'] == registrar
    )

    return {
        'same_registrar_count': same,
        'other_registrar_count': len(urls) - same
    }
###################################### ######################################
@ -495,9 +495,9 @@ class json_url_data(object):
""" """
Get values existing in a dict object, Get values existing in a dict object,
based on a known key string. based on a known key string.
Returns a list object. Returns a list object.
TODO: Major re-work for the fetch function TODO: Major re-work for the fetch function
TODO: Support for more sophisticated JSON key string filtering TODO: Support for more sophisticated JSON key string filtering
@ -563,7 +563,7 @@ class json_url_data(object):
if flatten == True: if flatten == True:
for u in get_data_extract(data_extract): for u in get_data_extract(data_extract):
flat_data.append(u) flat_data.append(u)
return flat_data return flat_data
else: else:
return data_extract return data_extract
@ -573,21 +573,21 @@ class json_url_data(object):
Compile URL related data. Compile URL related data.
""" """
def get_url_data(self, url): def get_url_data(self, url):
# Dict object for simple, non-nested data # Dict object for simple, non-nested data
data_simple = {} data_simple = {}
# Pre-defined dict object for specific data sets # Pre-defined dict object for specific data sets
webpage_data = {} webpage_data = {}
startfinal_url = self.get_startfinal_urls(url) startfinal_url = self.get_startfinal_urls(url)
redirect_url = self.get_url_redirects(url) redirect_url = self.get_url_redirects(url)
domain_registrar = self.get_domain_registrar(url) domain_registrar = self.get_domain_registrar(url)
domaintitle_match = self.get_domain_title_match(url) domaintitle_match = self.get_domain_title_match(url)
domain_time_relative = self.get_domain_timeinfo_relative(url) domain_time_relative = self.get_domain_timeinfo_relative(url)
domain_time = self.get_domain_timeinfo(url) domain_time = self.get_domain_timeinfo(url)
html_element_iframe = self.get_tag_data(url, 'iframe') html_element_iframe = self.get_tag_data(url, 'iframe')
html_element_a_href = self.get_tag_data(url, 'a', link_refs['a']) html_element_a_href = self.get_tag_data(url, 'a', link_refs['a'])
html_element_img_src = self.get_tag_data(url, 'img', link_refs['img']) html_element_img_src = self.get_tag_data(url, 'img', link_refs['img'])
@ -597,53 +597,53 @@ class json_url_data(object):
'iframes_count': 'iframes_count':
len(self.json_fetcher(html_element_iframe, 'iframe').get_data()) len(self.json_fetcher(html_element_iframe, 'iframe').get_data())
} }
multidot_urls_count = { multidot_urls_count = {
'multidot_url_count': 'multidot_url_count':
len(self.json_fetcher(html_element_a_href, 'multidot').get_data()) + len(self.json_fetcher(html_element_img_src, 'multidot').get_data()) + len(self.json_fetcher(html_element_script_src, 'multidot').get_data()) len(self.json_fetcher(html_element_a_href, 'multidot').get_data()) + len(self.json_fetcher(html_element_img_src, 'multidot').get_data()) + len(self.json_fetcher(html_element_script_src, 'multidot').get_data())
} }
################### ###################
def get_total_registrars(): def get_total_registrars():
same_registrar_counts = 0 same_registrar_counts = 0
other_registrar_counts = 0 other_registrar_counts = 0
for k,v in link_refs.items(): for k,v in link_refs.items():
html_element = self.get_tag_data(url, k, v) html_element = self.get_tag_data(url, k, v)
same_registrar_counts += self.get_registrar_count( same_registrar_counts += self.get_registrar_count(
domain_registrar['domain_registrar'], domain_registrar['domain_registrar'],
html_element[k][v + '_ext']['normal'] html_element[k][v + '_ext']['normal']
)['same_registrar_count'] )['same_registrar_count']
other_registrar_counts += self.get_registrar_count( other_registrar_counts += self.get_registrar_count(
domain_registrar['domain_registrar'], domain_registrar['domain_registrar'],
html_element[k][v + '_ext']['normal'] html_element[k][v + '_ext']['normal']
)['other_registrar_count'] )['other_registrar_count']
registrar_counts = { registrar_counts = {
'same_registrar_count': same_registrar_counts, 'same_registrar_count': same_registrar_counts,
'other_registrar_count': other_registrar_counts 'other_registrar_count': other_registrar_counts
} }
return registrar_counts return registrar_counts
# Avoid unnecessary nesting of the following data # Avoid unnecessary nesting of the following data
data_simple.update(domain_registrar) data_simple.update(domain_registrar)
data_simple.update(domaintitle_match) data_simple.update(domaintitle_match)
data_simple.update(iframes_count) data_simple.update(iframes_count)
data_simple.update(multidot_urls_count) data_simple.update(multidot_urls_count)
data_simple.update(get_total_registrars()) data_simple.update(get_total_registrars())
url_data = dict({ url_data = dict({
url: [ url: [
data_simple, data_simple,
startfinal_url, startfinal_url,
{'redirects': redirect_url}, {'redirects': redirect_url},
domain_time_relative, domain_time_relative,
domain_time, domain_time,
{'webpage_data': [ {'webpage_data': [
html_element_iframe, html_element_iframe,
html_element_a_href, html_element_a_href,
@ -653,7 +653,7 @@ class json_url_data(object):
} }
] ]
}) })
return url_data return url_data
@ -667,11 +667,11 @@ class write_operations(object):
""" """
Set JSON file name, append number suffix Set JSON file name, append number suffix
# if file exists already. # if file exists already.
Returns file name path. Returns file name path.
""" """
def set_filename(self): def set_filename(self):
c = 0 c = 0
while True: while True:
if os.path.exists(self.filename): if os.path.exists(self.filename):
@ -689,7 +689,7 @@ class write_operations(object):
Append to a JSON file. Append to a JSON file.
""" """
def write_to_file(self, data): def write_to_file(self, data):
try: try:
json_file = open(self.filename, "a") json_file = open(self.filename, "a")
json_file.write(data) json_file.write(data)
@ -762,7 +762,7 @@ class data_visualization(object):
'local_urls': link_count(unique_refs, '_self'), 'local_urls': link_count(unique_refs, '_self'),
'external_urls': link_count(unique_refs, '_ext') 'external_urls': link_count(unique_refs, '_ext')
} }
return data return data
def get_registrars(self): def get_registrars(self):
@ -778,7 +778,7 @@ class data_visualization(object):
return registrars return registrars
def get_registrar_count_summary(self): def get_registrar_count_summary(self):
domain_counter = dict(Counter(self.get_registrars())) domain_counter = dict(Counter(self.get_registrars()))
data = {'fetched_domains': domain_counter, 'url_domain_registrar': self.domain_registrar } data = {'fetched_domains': domain_counter, 'url_domain_registrar': self.domain_registrar }
return data return data
@ -807,7 +807,7 @@ if __name__ == '__main__':
# Get URLs from an available JSON data # Get URLs from an available JSON data
for key_url in json_data.keys(): for key_url in json_data.keys():
print("Generating statistics: %s" % key_url) print("Generating statistics: %s" % key_url)
fig = plt.figure() fig = plt.figure()
@ -817,19 +817,19 @@ if __name__ == '__main__':
# 'figure.constrained_layout.use': True # 'figure.constrained_layout.use': True
} }
plt.rcParams.update(fig_params) plt.rcParams.update(fig_params)
domain_string = url_str_pattern.split(key_url)[2].replace('.','') domain_string = url_str_pattern.split(key_url)[2].replace('.','')
summary = data_visualization(key_url, json_data) summary = data_visualization(key_url, json_data)
summary_registrars = summary.get_registrar_count_summary()['fetched_domains'] summary_registrars = summary.get_registrar_count_summary()['fetched_domains']
x_r = list(summary_registrars.keys()) x_r = list(summary_registrars.keys())
y_r = list(summary_registrars.values()) y_r = list(summary_registrars.values())
# Show bar values # Show bar values
for index,data in enumerate(y_r): for index,data in enumerate(y_r):
plt.text(x=index, y=data+0.5, s=data, fontdict=dict(fontsize=8)) plt.text(x=index, y=data+0.5, s=data, fontdict=dict(fontsize=8))
title_r = "Domains associated with HTML URL data (" + key_url + ")" title_r = "Domains associated with HTML URL data (" + key_url + ")"
xlabel_r = "Fetched domains" xlabel_r = "Fetched domains"
ylabel_r = "Domain count" ylabel_r = "Domain count"
@ -845,18 +845,17 @@ if __name__ == '__main__':
plt.show() plt.show()
#fig_u = plt.figure() #fig_u = plt.figure()
#summary_urls = summary.get_urls_count_summary() #summary_urls = summary.get_urls_count_summary()
#x_u = list(summary_urls.keys()) #x_u = list(summary_urls.keys())
#y_u = list(summary_urls.values()) #y_u = list(summary_urls.values())
#title_u = "Local and external URL references (" + key_url + ")" #title_u = "Local and external URL references (" + key_url + ")"
#xlabel_u = "Fetched URLs" #xlabel_u = "Fetched URLs"
#ylabel_u = "URL count" #ylabel_u = "URL count"
#plt.bar(x_u, y_u, color="blue", edgecolor='black') #plt.bar(x_u, y_u, color="blue", edgecolor='black')
#plt.title(title_u) #plt.title(title_u)
#plt.xlabel(xlabel_u) #plt.xlabel(xlabel_u)
#plt.ylabel(ylabel_u) #plt.ylabel(ylabel_u)
#plt.show() #plt.show()

Loading…
Cancel
Save