|
|
@ -96,21 +96,21 @@ class json_url_data(object): |
|
|
|
Returns a requests.models.Response object. |
|
|
|
""" |
|
|
|
def set_session(self, url, method='get', redirects=True):
    """Open an HTTP session for the given URL.

    url       -- target URL
    method    -- HTTP method name accepted by requests (default: 'get')
    redirects -- whether HTTP redirects are followed (default: True)

    Returns a requests.models.Response object.

    Raises Exception when the request itself fails or when the server
    answers with a status code outside the 1XX/2XX/3XX range.
    """
    # HTTP response status codes 1XX, 2XX and 3XX are OK;
    # treat other codes as errors.
    sc = re.compile(r"^[123][0-9]{2}")

    # Throttle consecutive requests.
    # NOTE(review): sleep_interval_between_requests and request_headers
    # are module-level settings defined elsewhere in this file.
    sleep(sleep_interval_between_requests)

    try:
        session = requests.Session()
        response = session.request(method, url, headers=request_headers, allow_redirects=redirects)
    except Exception:
        raise Exception("Error: HTTP session could not be established. URL: '" + url + "' (method: " + method + ")") from None

    # The status check sits outside the try block so that an invalid
    # status code is reported as such; the original bare 'except:'
    # caught this Exception too and re-raised it with the misleading
    # "session could not be established" message.
    if not sc.match(str(response.status_code)):
        raise Exception("Error: got invalid response status from the web server")

    return response
|
|
|
|
|
|
@ -121,7 +121,7 @@ class json_url_data(object): |
|
|
|
Returns a bs4.BeautifulSoup object. |
|
|
|
""" |
|
|
|
def get_html_data(self, url): |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
data = bs(self.set_session(url).content, 'html.parser') |
|
|
|
return data |
|
|
@ -135,15 +135,15 @@ class json_url_data(object): |
|
|
|
Returns a list object. |
|
|
|
""" |
|
|
|
def get_url_redirects(self, url):
    """Collect the redirect hops seen while fetching the URL.

    url -- target URL

    Returns a list object: one {'redirect_url', 'status'} dict per
    intermediate response, empty when no redirects occurred.
    """
    history = self.set_session(url).history
    return [
        {'redirect_url': hop.url, 'status': hop.status_code}
        for hop in history
    ]
|
|
|
|
|
|
|
###################################### |
|
|
@ -153,9 +153,9 @@ class json_url_data(object): |
|
|
|
Returns a string object. |
|
|
|
""" |
|
|
|
def get_webpage_title(self, url):
    """Extract the contents of the <title> HTML element.

    url -- target URL

    Returns a string object (bs4 NavigableString; None when the page
    has a <title> tag with no string content).
    """
    return self.get_html_data(url).title.string
|
|
|
|
|
|
@ -175,7 +175,7 @@ class json_url_data(object): |
|
|
|
""" |
|
|
|
def get_domain_name(self, url): |
|
|
|
domain_name = self.get_whois_data(url).domain_name |
|
|
|
|
|
|
|
|
|
|
|
if type(domain_name) is list: |
|
|
|
return domain_name[0].lower() |
|
|
|
else: |
|
|
@ -184,29 +184,29 @@ class json_url_data(object): |
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get initial and final URLs |
|
|
|
|
|
|
|
|
|
|
|
Compare whether the final (destination) URL |
|
|
|
matches with the initial URL in a request. |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_startfinal_urls(self, url): |
|
|
|
|
|
|
|
|
|
|
|
response = self.set_session(url) |
|
|
|
end_url = response.url |
|
|
|
|
|
|
|
|
|
|
|
start_match = False |
|
|
|
final_match = False |
|
|
|
|
|
|
|
|
|
|
|
# dr = re.compile(r"^([a-z]+://)?([^/]+)") |
|
|
|
# dr_group_lastindex = dr.match(url).lastindex |
|
|
|
# domain_name = dr.match(url).group(dr_group_lastindex) |
|
|
|
|
|
|
|
|
|
|
|
domain_name = self.get_domain_name(url) |
|
|
|
|
|
|
|
|
|
|
|
if re.search(domain_name, end_url): |
|
|
|
final_match = True |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
'startfinal_urls': { |
|
|
|
'start_url': { |
|
|
@ -217,13 +217,13 @@ class json_url_data(object): |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get domain registrar |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_domain_registrar(self, url): |
|
|
@ -235,21 +235,21 @@ class json_url_data(object): |
|
|
|
Do comparison between the domain name, extracted |
|
|
|
from WHOIS domain data and contents of a title HTML |
|
|
|
element, extracted from HTML data based on a given URL. |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_domain_title_match(self, url): |
|
|
|
|
|
|
|
|
|
|
|
domain_name = self.get_domain_name(url) |
|
|
|
title = self.get_webpage_title(url) |
|
|
|
|
|
|
|
|
|
|
|
# If is string: |
|
|
|
if type(domain_name) is str: |
|
|
|
if re.search(domain_name, title, re.IGNORECASE): |
|
|
|
match = True |
|
|
|
else: |
|
|
|
match = False |
|
|
|
|
|
|
|
|
|
|
|
# If is list: |
|
|
|
elif type(domain_name) is list: |
|
|
|
for d in domain_name: |
|
|
@ -260,49 +260,49 @@ class json_url_data(object): |
|
|
|
match = False |
|
|
|
else: |
|
|
|
match = False |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
'webpage_title': title, |
|
|
|
'domain_in_webpage_title': match |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get a single timestamp from given data |
|
|
|
|
|
|
|
|
|
|
|
Two scenarios are considered: dates argument is either |
|
|
|
a list or a string. If it is a list, then we need |
|
|
|
to decide which date value to extract. |
|
|
|
|
|
|
|
|
|
|
|
Returns a date object. |
|
|
|
""" |
|
|
|
def get_single_date(self, dates, newest=False):
    """Reduce one-or-many datetime values to a single datetime.

    dates  -- a datetime object, or a list of datetime objects
    newest -- pick the most recent value when True, the oldest otherwise

    Returns a date(time) object, reconstructed from the chosen epoch
    timestamp.
    """
    # Normalise input to a list of epoch timestamps, then take the
    # first element of the (optionally reversed) sorted order.
    if type(dates) is list:
        stamps = [d.timestamp() for d in dates]
    else:
        stamps = [dates.timestamp()]

    stamps.sort(reverse=newest)
    return datetime.fromtimestamp(stamps[0])
|
|
|
|
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get domain time information based on WHOIS domain data. |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_domain_timeinfo(self, url): |
|
|
|
|
|
|
|
|
|
|
|
whois_data = self.get_whois_data(url) |
|
|
|
domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False) |
|
|
|
domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False) |
|
|
|
domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False) |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
'domain_timestamps': |
|
|
|
{ |
|
|
@ -311,25 +311,25 @@ class json_url_data(object): |
|
|
|
'expires': domain_expiration_date.strftime(dateformat) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get domain time information based on WHOIS domain data, |
|
|
|
relative to the current date (UTC time). |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_domain_timeinfo_relative(self, url): |
|
|
|
|
|
|
|
|
|
|
|
date_now = datetime.utcnow() |
|
|
|
|
|
|
|
|
|
|
|
whois_data = self.get_whois_data(url) |
|
|
|
domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False) |
|
|
|
domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False) |
|
|
|
domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False) |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
'domain_timestamps_relative': |
|
|
|
{ |
|
|
@ -339,7 +339,7 @@ class json_url_data(object): |
|
|
|
'expires_days_left': (domain_expiration_date - date_now).days |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
@ -348,15 +348,15 @@ class json_url_data(object): |
|
|
|
'../foo/bar/' |
|
|
|
'/foo/../../bar/, |
|
|
|
'https://foo.bar/foo/../' |
|
|
|
|
|
|
|
|
|
|
|
etc. |
|
|
|
|
|
|
|
|
|
|
|
Returns a boolean object. |
|
|
|
""" |
|
|
|
def is_multidot_url(self, url):
    """Tell whether the URL contains a '../' traversal sequence.

    url -- URL string to inspect

    Returns a boolean object: True for URLs such as '../foo/bar/',
    '/foo/../../bar/' or 'https://foo.bar/foo/../', False otherwise.
    """
    # Any two consecutive dots immediately followed by '/' counts,
    # wherever it appears in the URL.
    return re.match(r".*[.]{2}/.*", url) is not None
|
|
@ -364,13 +364,13 @@ class json_url_data(object): |
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get HTML element data from HTML data contents. |
|
|
|
|
|
|
|
|
|
|
|
Two fetching methods are supported: |
|
|
|
- A) use only HTML element/tag name and extract raw contents of |
|
|
|
these tags |
|
|
|
- B) use both HTML element/tag name and more fine-grained |
|
|
|
inner attribute name to determine which HTML elements are extracted |
|
|
|
|
|
|
|
|
|
|
|
Special case - URL link references: |
|
|
|
- attributes 'href' or 'src' are considered as link referrals and |
|
|
|
they are handled in a special way |
|
|
@ -378,55 +378,55 @@ class json_url_data(object): |
|
|
|
(patterns: '/', '#', '../' and '/<anything>') |
|
|
|
- B) link referrals to external domains are placed in 'ext_refs' list |
|
|
|
(patterns such as 'https://foo.bar.dot/fancysite' etc.) |
|
|
|
|
|
|
|
|
|
|
|
- Both A) and B) link categories have 'normal' and 'multidot' subcategories |
|
|
|
- normal links do not contain pattern '../' |
|
|
|
- multidot links contain '../' pattern |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
def get_tag_data(self, url, tag, attribute=None): |
|
|
|
|
|
|
|
|
|
|
|
html_data = self.get_html_data(url) |
|
|
|
domain_name = self.get_domain_name(url) |
|
|
|
data = [] |
|
|
|
|
|
|
|
|
|
|
|
if attribute != None: |
|
|
|
|
|
|
|
|
|
|
|
for d in html_data.find_all(tag): |
|
|
|
|
|
|
|
|
|
|
|
# Ignore the HTML tag if it does not contain our attribute |
|
|
|
if d.get(attribute) != None: |
|
|
|
data.append(d.get(attribute)) |
|
|
|
|
|
|
|
|
|
|
|
if attribute == 'href' or attribute == 'src': |
|
|
|
|
|
|
|
|
|
|
|
self_refs = { 'normal': [], 'multidot': []} |
|
|
|
ext_refs = { 'normal': [], 'multidot': []} |
|
|
|
|
|
|
|
|
|
|
|
# Syntax: '#<anything>', '/<anything>', '../<anything>' |
|
|
|
rs = re.compile(r"^[/#]|^[.]{2}/.*") |
|
|
|
|
|
|
|
|
|
|
|
# Syntax: '<text>:<text>/' |
|
|
|
rd = re.compile(r"^[a-z]+:[a-z]+/") |
|
|
|
|
|
|
|
|
|
|
|
# Syntax examples: |
|
|
|
# 'http://foo.bar/', 'https://foo.bar/, 'foo.bar/', 'https://virus.foo.bar/' |
|
|
|
rl = re.compile(r"^([a-z]+://)?([^/]*" + domain_name + "/)") |
|
|
|
|
|
|
|
|
|
|
|
for s in data: |
|
|
|
|
|
|
|
|
|
|
|
# Ignore mailto links |
|
|
|
if re.match("^mailto:", s): continue |
|
|
|
|
|
|
|
|
|
|
|
if rs.match(s) or rl.match(s) or rd.match(s): |
|
|
|
if self.is_multidot_url(s): |
|
|
|
self_refs['multidot'].append(s) |
|
|
|
else: |
|
|
|
self_refs['normal'].append(s) |
|
|
|
else: |
|
|
|
|
|
|
|
|
|
|
|
if self.is_multidot_url(s): |
|
|
|
try: |
|
|
|
ext_refs['multidot'].append({'url': s, 'registrar': self.get_whois_data(s).registrar }) |
|
|
@ -440,31 +440,31 @@ class json_url_data(object): |
|
|
|
except: |
|
|
|
ext_refs['normal'].append({'url': s, 'registrar': None }) |
|
|
|
pass |
|
|
|
|
|
|
|
|
|
|
|
data = None |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
tag: { |
|
|
|
attribute + '_ext': (ext_refs), |
|
|
|
attribute + '_self': (self_refs) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
else: |
|
|
|
dict_data = { |
|
|
|
tag: { |
|
|
|
attribute: (data) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
else: |
|
|
|
for d in html_data.find_all(tag): |
|
|
|
data.append(d.prettify()) |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
tag: (data) |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
@ -473,21 +473,21 @@ class json_url_data(object): |
|
|
|
the webpage itself? |
|
|
|
""" |
|
|
|
def get_registrar_count(self, registrar, urls):
    """Count how many URL entries share the given registrar.

    registrar -- registrar name to compare against
    urls      -- list of dicts, each optionally carrying a 'registrar' key

    Returns a dict object with 'same_registrar_count' and
    'other_registrar_count' keys; entries without a 'registrar' key
    count as "other".
    """
    # Direct key lookup instead of scanning every (key, value) pair of
    # each dict. The explicit membership test keeps a missing
    # 'registrar' key from ever matching (even when registrar is None).
    same = sum(
        1 for u in urls
        if 'registrar' in u and u['registrar'] == registrar
    )

    return {
        'same_registrar_count': same,
        'other_registrar_count': len(urls) - same,
    }
|
|
|
|
|
|
|
###################################### |
|
|
@ -495,9 +495,9 @@ class json_url_data(object): |
|
|
|
""" |
|
|
|
Get values existing in a dict object, |
|
|
|
based on a known key string. |
|
|
|
|
|
|
|
|
|
|
|
Returns a list object. |
|
|
|
|
|
|
|
|
|
|
|
TODO: Major re-work for the fetch function |
|
|
|
|
|
|
|
TODO: Support for more sophisticated JSON key string filtering |
|
|
@ -563,7 +563,7 @@ class json_url_data(object): |
|
|
|
if flatten == True: |
|
|
|
for u in get_data_extract(data_extract): |
|
|
|
flat_data.append(u) |
|
|
|
|
|
|
|
|
|
|
|
return flat_data |
|
|
|
else: |
|
|
|
return data_extract |
|
|
@ -573,21 +573,21 @@ class json_url_data(object): |
|
|
|
Compile URL related data. |
|
|
|
""" |
|
|
|
def get_url_data(self, url): |
|
|
|
|
|
|
|
|
|
|
|
# Dict object for simple, non-nested data |
|
|
|
data_simple = {} |
|
|
|
|
|
|
|
# Pre-defined dict object for specific data sets |
|
|
|
webpage_data = {} |
|
|
|
|
|
|
|
|
|
|
|
startfinal_url = self.get_startfinal_urls(url) |
|
|
|
redirect_url = self.get_url_redirects(url) |
|
|
|
domain_registrar = self.get_domain_registrar(url) |
|
|
|
domaintitle_match = self.get_domain_title_match(url) |
|
|
|
|
|
|
|
|
|
|
|
domain_time_relative = self.get_domain_timeinfo_relative(url) |
|
|
|
domain_time = self.get_domain_timeinfo(url) |
|
|
|
|
|
|
|
|
|
|
|
html_element_iframe = self.get_tag_data(url, 'iframe') |
|
|
|
html_element_a_href = self.get_tag_data(url, 'a', link_refs['a']) |
|
|
|
html_element_img_src = self.get_tag_data(url, 'img', link_refs['img']) |
|
|
@ -597,53 +597,53 @@ class json_url_data(object): |
|
|
|
'iframes_count': |
|
|
|
len(self.json_fetcher(html_element_iframe, 'iframe').get_data()) |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
multidot_urls_count = { |
|
|
|
'multidot_url_count': |
|
|
|
len(self.json_fetcher(html_element_a_href, 'multidot').get_data()) + len(self.json_fetcher(html_element_img_src, 'multidot').get_data()) + len(self.json_fetcher(html_element_script_src, 'multidot').get_data()) |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
################### |
|
|
|
def get_total_registrars(): |
|
|
|
|
|
|
|
same_registrar_counts = 0 |
|
|
|
other_registrar_counts = 0 |
|
|
|
for k,v in link_refs.items(): |
|
|
|
|
|
|
|
|
|
|
|
html_element = self.get_tag_data(url, k, v) |
|
|
|
|
|
|
|
|
|
|
|
same_registrar_counts += self.get_registrar_count( |
|
|
|
domain_registrar['domain_registrar'], |
|
|
|
html_element[k][v + '_ext']['normal'] |
|
|
|
)['same_registrar_count'] |
|
|
|
|
|
|
|
|
|
|
|
other_registrar_counts += self.get_registrar_count( |
|
|
|
domain_registrar['domain_registrar'], |
|
|
|
html_element[k][v + '_ext']['normal'] |
|
|
|
)['other_registrar_count'] |
|
|
|
|
|
|
|
|
|
|
|
registrar_counts = { |
|
|
|
'same_registrar_count': same_registrar_counts, |
|
|
|
'other_registrar_count': other_registrar_counts |
|
|
|
} |
|
|
|
return registrar_counts |
|
|
|
|
|
|
|
|
|
|
|
# Avoid unnecessary nesting of the following data |
|
|
|
data_simple.update(domain_registrar) |
|
|
|
data_simple.update(domaintitle_match) |
|
|
|
data_simple.update(iframes_count) |
|
|
|
data_simple.update(multidot_urls_count) |
|
|
|
data_simple.update(get_total_registrars()) |
|
|
|
|
|
|
|
|
|
|
|
url_data = dict({ |
|
|
|
url: [ |
|
|
|
data_simple, |
|
|
|
startfinal_url, |
|
|
|
{'redirects': redirect_url}, |
|
|
|
|
|
|
|
|
|
|
|
domain_time_relative, |
|
|
|
domain_time, |
|
|
|
|
|
|
|
|
|
|
|
{'webpage_data': [ |
|
|
|
html_element_iframe, |
|
|
|
html_element_a_href, |
|
|
@ -653,7 +653,7 @@ class json_url_data(object): |
|
|
|
} |
|
|
|
] |
|
|
|
}) |
|
|
|
|
|
|
|
|
|
|
|
return url_data |
|
|
|
|
|
|
|
|
|
|
@ -667,11 +667,11 @@ class write_operations(object): |
|
|
|
""" |
|
|
|
Set JSON file name, append number suffix |
|
|
|
# if file exists already. |
|
|
|
|
|
|
|
|
|
|
|
Returns file name path. |
|
|
|
""" |
|
|
|
def set_filename(self): |
|
|
|
|
|
|
|
|
|
|
|
c = 0 |
|
|
|
while True: |
|
|
|
if os.path.exists(self.filename): |
|
|
@ -689,7 +689,7 @@ class write_operations(object): |
|
|
|
Append to a JSON file. |
|
|
|
""" |
|
|
|
def write_to_file(self, data): |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
json_file = open(self.filename, "a") |
|
|
|
json_file.write(data) |
|
|
@ -762,7 +762,7 @@ class data_visualization(object): |
|
|
|
'local_urls': link_count(unique_refs, '_self'), |
|
|
|
'external_urls': link_count(unique_refs, '_ext') |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return data |
|
|
|
|
|
|
|
def get_registrars(self): |
|
|
@ -778,7 +778,7 @@ class data_visualization(object): |
|
|
|
return registrars |
|
|
|
|
|
|
|
def get_registrar_count_summary(self):
    """Summarise registrar frequencies for the fetched domains.

    Returns a dict object: per-registrar counts under
    'fetched_domains', plus the registrar of the URL's own domain
    under 'url_domain_registrar'.
    """
    counted = Counter(self.get_registrars())
    return {
        'fetched_domains': dict(counted),
        'url_domain_registrar': self.domain_registrar,
    }
|
|
@ -807,7 +807,7 @@ if __name__ == '__main__': |
|
|
|
|
|
|
|
# Get URLs from an available JSON data |
|
|
|
for key_url in json_data.keys(): |
|
|
|
|
|
|
|
|
|
|
|
print("Generating statistics: %s" % key_url) |
|
|
|
|
|
|
|
fig = plt.figure() |
|
|
@ -817,19 +817,19 @@ if __name__ == '__main__': |
|
|
|
# 'figure.constrained_layout.use': True |
|
|
|
} |
|
|
|
plt.rcParams.update(fig_params) |
|
|
|
|
|
|
|
|
|
|
|
domain_string = url_str_pattern.split(key_url)[2].replace('.','') |
|
|
|
summary = data_visualization(key_url, json_data) |
|
|
|
|
|
|
|
|
|
|
|
summary_registrars = summary.get_registrar_count_summary()['fetched_domains'] |
|
|
|
|
|
|
|
x_r = list(summary_registrars.keys()) |
|
|
|
y_r = list(summary_registrars.values()) |
|
|
|
|
|
|
|
|
|
|
|
# Show bar values |
|
|
|
for index,data in enumerate(y_r): |
|
|
|
plt.text(x=index, y=data+0.5, s=data, fontdict=dict(fontsize=8)) |
|
|
|
|
|
|
|
|
|
|
|
title_r = "Domains associated with HTML URL data (" + key_url + ")" |
|
|
|
xlabel_r = "Fetched domains" |
|
|
|
ylabel_r = "Domain count" |
|
|
@ -845,18 +845,17 @@ if __name__ == '__main__': |
|
|
|
plt.show() |
|
|
|
|
|
|
|
#fig_u = plt.figure() |
|
|
|
|
|
|
|
|
|
|
|
#summary_urls = summary.get_urls_count_summary() |
|
|
|
|
|
|
|
|
|
|
|
#x_u = list(summary_urls.keys()) |
|
|
|
#y_u = list(summary_urls.values()) |
|
|
|
#title_u = "Local and external URL references (" + key_url + ")" |
|
|
|
#xlabel_u = "Fetched URLs" |
|
|
|
#ylabel_u = "URL count" |
|
|
|
|
|
|
|
|
|
|
|
#plt.bar(x_u, y_u, color="blue", edgecolor='black') |
|
|
|
#plt.title(title_u) |
|
|
|
#plt.xlabel(xlabel_u) |
|
|
|
#plt.ylabel(ylabel_u) |
|
|
|
#plt.show() |
|
|
|
|