diff --git a/code/url-analyzer.py b/code/url-analyzer.py index 7201910..93db4a0 100755 --- a/code/url-analyzer.py +++ b/code/url-analyzer.py @@ -96,21 +96,21 @@ class json_url_data(object): Returns a requests.models.Response object. """ def set_session(self, url, method='get', redirects=True): - + # HTTP response status codes 1XX, 2XX and 3XX are OK # Treat other codes as errors sc = re.compile(r"^[123]{1}[0-9]{2}") - + sleep(sleep_interval_between_requests) - + try: session = requests.Session() response = session.request(method, url, headers=request_headers, allow_redirects=redirects) - + if not sc.match(str(response.status_code)): raise Exception("Error: got invalid response status from the web server") return response - + except: raise Exception("Error: HTTP session could not be established. URL: '" + url + "' (method: " + method + ")") from None @@ -121,7 +121,7 @@ class json_url_data(object): Returns a bs4.BeautifulSoup object. """ def get_html_data(self, url): - + try: data = bs(self.set_session(url).content, 'html.parser') return data @@ -135,15 +135,15 @@ class json_url_data(object): Returns a list object. """ def get_url_redirects(self, url): - + response = self.set_session(url) list_data = [] - + if response.history: - + for r in response.history: list_data.append({'redirect_url': r.url, 'status': r.status_code}) - + return list_data ###################################### @@ -153,9 +153,9 @@ class json_url_data(object): Returns a string object. """ def get_webpage_title(self, url): - + html_data = self.get_html_data(url) - + title = html_data.title.string return title @@ -175,7 +175,7 @@ class json_url_data(object): """ def get_domain_name(self, url): domain_name = self.get_whois_data(url).domain_name - + if type(domain_name) is list: return domain_name[0].lower() else: @@ -184,29 +184,29 @@ class json_url_data(object): ###################################### """ Get initial and final URLs - + Compare whether the final (destination) URL matches with the initial URL in a request. - + Returns a dict object. """ def get_startfinal_urls(self, url): - + response = self.set_session(url) end_url = response.url - + start_match = False final_match = False - + # dr = re.compile(r"^([a-z]+://)?([^/]+)") # dr_group_lastindex = dr.match(url).lastindex # domain_name = dr.match(url).group(dr_group_lastindex) - + domain_name = self.get_domain_name(url) - + if re.search(domain_name, end_url): final_match = True - + dict_data = { 'startfinal_urls': { 'start_url': { @@ -217,13 +217,13 @@ class json_url_data(object): } } } - + return dict_data ###################################### """ Get domain registrar - + Returns a dict object. """ def get_domain_registrar(self, url): @@ -235,21 +235,21 @@ class json_url_data(object): Do comparison between the domain name, extracted from WHOIS domain data and contents of a title HTML element, extracted from HTML data based on a given URL. - + Returns a dict object. """ def get_domain_title_match(self, url): - + domain_name = self.get_domain_name(url) title = self.get_webpage_title(url) - + # If is string: if type(domain_name) is str: if re.search(domain_name, title, re.IGNORECASE): match = True else: match = False - + # If is list: elif type(domain_name) is list: for d in domain_name: @@ -260,49 +260,49 @@ class json_url_data(object): match = False else: match = False - + dict_data = { 'webpage_title': title, 'domain_in_webpage_title': match } - + return dict_data ###################################### """ Get a single timestamp from given data - + Two scenarios are considered: dates argument is either a list or a string. If it is a list, then we need to decide which date value to extract. - + Returns a date object. """ def get_single_date(self, dates, newest=False): - + dates_epoch = [] - + if type(dates) is list: for d in dates: dates_epoch.append(d.timestamp()) else: dates_epoch.append(dates.timestamp()) - + return datetime.fromtimestamp(sorted(dates_epoch, reverse=newest)[0]) ###################################### """ Get domain time information based on WHOIS domain data. - + Returns a dict object. """ def get_domain_timeinfo(self, url): - + whois_data = self.get_whois_data(url) domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False) domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False) domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False) - + dict_data = { 'domain_timestamps': { @@ -311,25 +311,25 @@ class json_url_data(object): 'expires': domain_expiration_date.strftime(dateformat) } } - + return dict_data ###################################### """ Get domain time information based on WHOIS domain data, relative to the current date (UTC time). - + Returns a dict object. """ def get_domain_timeinfo_relative(self, url): - + date_now = datetime.utcnow() - + whois_data = self.get_whois_data(url) domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False) domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False) domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False) - + dict_data = { 'domain_timestamps_relative': { @@ -339,7 +339,7 @@ class json_url_data(object): 'expires_days_left': (domain_expiration_date - date_now).days } } - + return dict_data ###################################### @@ -348,15 +348,15 @@ class json_url_data(object): '../foo/bar/' '/foo/../../bar/, 'https://foo.bar/foo/../' - + etc. - + Returns a boolean object. """ def is_multidot_url(self, url): - + multidot = re.compile(r".*[.]{2}/.*") - + if multidot.match(url): return True return False @@ -364,13 +364,13 @@ class json_url_data(object): ###################################### """ Get HTML element data from HTML data contents. - + Two fetching methods are supported: - A) use only HTML element/tag name and extract raw contents of these tags - B) use both HTML element/tag name and more fine-grained inner attribute name to determine which HTML elements are extracted - + Special case - URL link references: - attributes 'href' or 'src' are considered as link referrals and they are handled in a special way @@ -378,55 +378,55 @@ class json_url_data(object): (patterns: '/', '#', '../' and '/') - B) link referrals to external domains are placed in 'ext_refs' list (patterns such as 'https://foo.bar.dot/fancysite' etc.) - + - Both A) and B) link categories have 'normal' and 'multidot' subcategories - normal links do not contain pattern '../' - multidot links contain '../' pattern - + Returns a dict object. """ - + def get_tag_data(self, url, tag, attribute=None): - + html_data = self.get_html_data(url) domain_name = self.get_domain_name(url) data = [] - + if attribute != None: - + for d in html_data.find_all(tag): - + # Ignore the HTML tag if it does not contain our attribute if d.get(attribute) != None: data.append(d.get(attribute)) - + if attribute == 'href' or attribute == 'src': - + self_refs = { 'normal': [], 'multidot': []} ext_refs = { 'normal': [], 'multidot': []} - + # Syntax: '#', '/', '../' rs = re.compile(r"^[/#]|^[.]{2}/.*") - + # Syntax: ':/' rd = re.compile(r"^[a-z]+:[a-z]+/") - + # Syntax examples: # 'http://foo.bar/', 'https://foo.bar/, 'foo.bar/', 'https://virus.foo.bar/' rl = re.compile(r"^([a-z]+://)?([^/]*" + domain_name + "/)") - + for s in data: - + # Ignore mailto links if re.match("^mailto:", s): continue - + if rs.match(s) or rl.match(s) or rd.match(s): if self.is_multidot_url(s): self_refs['multidot'].append(s) else: self_refs['normal'].append(s) else: - + if self.is_multidot_url(s): try: ext_refs['multidot'].append({'url': s, 'registrar': self.get_whois_data(s).registrar }) @@ -440,31 +440,31 @@ class json_url_data(object): except: ext_refs['normal'].append({'url': s, 'registrar': None }) pass - + data = None - + dict_data = { tag: { attribute + '_ext': (ext_refs), attribute + '_self': (self_refs) } } - + else: dict_data = { tag: { attribute: (data) } } - + else: for d in html_data.find_all(tag): data.append(d.prettify()) - + dict_data = { tag: (data) } - + return dict_data ###################################### @@ -473,21 +473,21 @@ class json_url_data(object): the webpage itself? """ def get_registrar_count(self, registrar, urls): - + i = 0 - + for u in urls: for k,v in u.items(): if k == 'registrar' and v == registrar: i += 1 - + o = len(urls) - i - + dict_data = { 'same_registrar_count': i, 'other_registrar_count': o } - + return dict_data ###################################### @@ -495,9 +495,9 @@ class json_url_data(object): """ Get values existing in a dict object, based on a known key string. - + Returns a list object. - + TODO: Major re-work for the fetch function TODO: Support for more sophisticated JSON key string filtering @@ -563,7 +563,7 @@ class json_url_data(object): if flatten == True: for u in get_data_extract(data_extract): flat_data.append(u) - + return flat_data else: return data_extract @@ -573,21 +573,21 @@ class json_url_data(object): Compile URL related data. """ def get_url_data(self, url): - + # Dict object for simple, non-nested data data_simple = {} # Pre-defined dict object for specific data sets webpage_data = {} - + startfinal_url = self.get_startfinal_urls(url) redirect_url = self.get_url_redirects(url) domain_registrar = self.get_domain_registrar(url) domaintitle_match = self.get_domain_title_match(url) - + domain_time_relative = self.get_domain_timeinfo_relative(url) domain_time = self.get_domain_timeinfo(url) - + html_element_iframe = self.get_tag_data(url, 'iframe') html_element_a_href = self.get_tag_data(url, 'a', link_refs['a']) html_element_img_src = self.get_tag_data(url, 'img', link_refs['img']) @@ -597,53 +597,53 @@ class json_url_data(object): 'iframes_count': len(self.json_fetcher(html_element_iframe, 'iframe').get_data()) } - + multidot_urls_count = { 'multidot_url_count': len(self.json_fetcher(html_element_a_href, 'multidot').get_data()) + len(self.json_fetcher(html_element_img_src, 'multidot').get_data()) + len(self.json_fetcher(html_element_script_src, 'multidot').get_data()) } - + ################### def get_total_registrars(): same_registrar_counts = 0 other_registrar_counts = 0 for k,v in link_refs.items(): - + html_element = self.get_tag_data(url, k, v) - + same_registrar_counts += self.get_registrar_count( domain_registrar['domain_registrar'], html_element[k][v + '_ext']['normal'] )['same_registrar_count'] - + other_registrar_counts += self.get_registrar_count( domain_registrar['domain_registrar'], html_element[k][v + '_ext']['normal'] )['other_registrar_count'] - + registrar_counts = { 'same_registrar_count': same_registrar_counts, 'other_registrar_count': other_registrar_counts } return registrar_counts - + # Avoid unnecessary nesting of the following data data_simple.update(domain_registrar) data_simple.update(domaintitle_match) data_simple.update(iframes_count) data_simple.update(multidot_urls_count) data_simple.update(get_total_registrars()) - + url_data = dict({ url: [ data_simple, startfinal_url, {'redirects': redirect_url}, - + domain_time_relative, domain_time, - + {'webpage_data': [ html_element_iframe, html_element_a_href, @@ -653,7 +653,7 @@ class json_url_data(object): } ] }) - + return url_data @@ -667,11 +667,11 @@ class write_operations(object): """ Set JSON file name, append number suffix # if file exists already. - + Returns file name path. """ def set_filename(self): - + c = 0 while True: if os.path.exists(self.filename): @@ -689,7 +689,7 @@ class write_operations(object): Append to a JSON file. """ def write_to_file(self, data): - + try: json_file = open(self.filename, "a") json_file.write(data) @@ -762,7 +762,7 @@ class data_visualization(object): 'local_urls': link_count(unique_refs, '_self'), 'external_urls': link_count(unique_refs, '_ext') } - + return data def get_registrars(self): @@ -778,7 +778,7 @@ class data_visualization(object): return registrars def get_registrar_count_summary(self): - + domain_counter = dict(Counter(self.get_registrars())) data = {'fetched_domains': domain_counter, 'url_domain_registrar': self.domain_registrar } return data @@ -807,7 +807,7 @@ if __name__ == '__main__': # Get URLs from an available JSON data for key_url in json_data.keys(): - + print("Generating statistics: %s" % key_url) fig = plt.figure() @@ -817,19 +817,19 @@ if __name__ == '__main__': # 'figure.constrained_layout.use': True } plt.rcParams.update(fig_params) - + domain_string = url_str_pattern.split(key_url)[2].replace('.','') summary = data_visualization(key_url, json_data) - + summary_registrars = summary.get_registrar_count_summary()['fetched_domains'] x_r = list(summary_registrars.keys()) y_r = list(summary_registrars.values()) - + # Show bar values for index,data in enumerate(y_r): plt.text(x=index, y=data+0.5, s=data, fontdict=dict(fontsize=8)) - + title_r = "Domains associated with HTML URL data (" + key_url + ")" xlabel_r = "Fetched domains" ylabel_r = "Domain count" @@ -845,18 +845,17 @@ if __name__ == '__main__': plt.show() #fig_u = plt.figure() - + #summary_urls = summary.get_urls_count_summary() - + #x_u = list(summary_urls.keys()) #y_u = list(summary_urls.values()) #title_u = "Local and external URL references (" + key_url + ")" #xlabel_u = "Fetched URLs" #ylabel_u = "URL count" - + #plt.bar(x_u, y_u, color="blue", edgecolor='black') #plt.title(title_u) #plt.xlabel(xlabel_u) #plt.ylabel(ylabel_u) #plt.show() -