URL data analyzer and extractor. Detect malicious signs and other useful data associated with URLs.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

861 lines
22 KiB

  1. #!/bin/env python
  2. """
  3. URL data extractor
  4. Pekka Helenius <pekka [dot] helenius [at] fjordtek [dot] com>
  5. Requirements:
  6. Python 3
  7. Python 3 BeautifulSoup4 (python-beautifulsoup4)
  8. Python 3 whois (python-whois; PyPI)
  9. Python 3 JSON Schema (python-jsonschema)
  10. Python 3 Numpy (python-numpy)
  11. Python 3 matplotlib (python-matplotlib)
  12. TODO: URL domain part length comparison analysis
  13. TODO: URL non-TLD part length comparison analysis
  14. - in phishing webpages, URL tends to be much longer than legitimate webpages
  15. however, domains themselves tend to be much shorter (without TLD)
  16. - phishing URLs often contain more dots and subdomains than legitimate URLs
  17. - legitimate: robots.txt redirects bots to a legitimate domain rather than to the original phishing domain
  18. TODO: Website visual similarity analysis
  19. TODO: consistency of RDN usage in HTML data
  20. """
  21. ######################################
  22. #%matplotlib inline
  23. import matplotlib.pyplot as plt
  24. from bs4 import BeautifulSoup as bs
  25. from collections import Counter
  26. from datetime import date, datetime
  27. import json
  28. import os
  29. import re
  30. import requests
  31. from time import sleep
  32. import urllib
  33. from whois import whois
  # URLs that will be analyzed
  urls = [
      "https://hoxhunt.com/",
      "https://hs.fi",
      "https://ts.fi",
      "https://facebook.com",
  ]

  # Headers sent with every HTTP request. A well-known browser user agent
  # is used because some web servers reject unfamiliar clients.
  request_headers = {
      'User-Agent': (
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
          'AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/88.0.4324.190 Safari/537.36'
      ),
  }

  # strftime pattern applied to domain timestamps
  dateformat = "%Y/%m/%d"

  # Pause (seconds) before each request, so target servers are not hammered
  sleep_interval_between_requests = 0.5

  # Whether the JSON results are written to a file
  use_file = True

  # Full path of the JSON result file (current working directory)
  filename = f"{os.getcwd()}/url_info.json"

  # Whether to skip fetching and only plot existing JSON data
  plot_only = False

  # Whether generated plot images are saved, and at which DPI
  save_plot_images = True
  plot_images_dpi = 150

  # HTML element name -> attribute that carries its link reference
  link_refs = dict(a='href', img='src', script='src')
  66. ############################################################################
  67. ############################################################################
  68. class json_url_data(object):
  69. # def __init__(self):
  70. ######################################
  71. """
  72. Set a new HTTP session and get response.
  73. Returns a requests.models.Response object.
  74. """
  def set_session(self, url, method='get', redirects=True, timeout=30):
      """
      Open a short-lived HTTP session, send one request and return the response.

      url       -- target URL
      method    -- HTTP method name passed to requests (default 'get')
      redirects -- whether redirects are followed
      timeout   -- seconds to wait for the server before giving up; without
                   a timeout a stalled server would block the program forever

      Returns the requests response object.
      Raises Exception if the request fails or the status code is not
      in the 1XX-3XX range.
      """
      # Throttle: wait before every request
      sleep(sleep_interval_between_requests)

      try:
          # The context manager closes the session (and its connections)
          # even when the request fails
          with requests.Session() as session:
              response = session.request(
                  method,
                  url,
                  headers=request_headers,
                  allow_redirects=redirects,
                  timeout=timeout,
              )
      except requests.RequestException as err:
          raise Exception("Error: HTTP session could not be established. URL: '" + url + "' (method: " + method + ")") from err

      # HTTP response status codes 1XX, 2XX and 3XX are OK,
      # treat other codes as errors
      if not 100 <= response.status_code < 400:
          raise Exception(
              "Error: got invalid response status from the web server"
              " (status: " + str(response.status_code) + ", URL: '" + url + "')"
          )

      return response
  88. ######################################
  89. """
  90. Fetch HTML data.
  91. Returns a bs4.BeautifulSoup object.
  92. """
  def get_html_data(self, url):
      """
      Download the page behind url and parse it with the 'html.parser'
      backend of BeautifulSoup.

      Returns the parsed BeautifulSoup object.
      Raises Exception (chained to the original error) if the page cannot
      be fetched or parsed.
      """
      try:
          return bs(self.set_session(url).content, 'html.parser')
      except Exception as err:
          # Keep the underlying cause attached so failures can be diagnosed;
          # 'except Exception' lets KeyboardInterrupt/SystemExit pass through
          raise Exception("Error: HTML data could not be retrieved") from err
  99. ######################################
  100. """
  101. Get URL redirects and related HTTP status codes.
  102. Returns a list object.
  103. """
  def get_url_redirects(self, url):
      """
      List every intermediate response recorded while requesting url.

      Returns a list of {'redirect_url': ..., 'status': ...} dicts, one per
      entry of the response history, in order. The list is empty when
      there is no history.
      """
      hops = self.set_session(url).history or []
      return [
          {'redirect_url': hop.url, 'status': hop.status_code}
          for hop in hops
      ]
  111. ######################################
  112. """
  113. Extract title HTML element contents from given HTML data.
  114. Returns a string object.
  115. """
  def get_webpage_title(self, url):
      """
      Return the text of the page's <title> element.

      Returns an empty string when the page has no <title> element or the
      element has no single string content, instead of failing with an
      AttributeError.
      """
      title_tag = self.get_html_data(url).title
      if title_tag is None or title_tag.string is None:
          return ""
      return title_tag.string
  120. ######################################
  121. """
  122. Get WHOIS domain data.
  123. Returns a dict object.
  124. """
  def get_whois_data(self, url):
      """
      Run a WHOIS lookup for url.

      Returns whatever the imported whois() function returns, unchanged.
      """
      return whois(url)
  128. ######################################
  129. """
  130. Get domain name based on WHOIS domain data.
  131. """
  def get_domain_name(self, url):
      """
      Return the lower-cased domain name found in the WHOIS data of url.

      The WHOIS 'domain_name' field may be a single string or a list of
      strings; for a list the first entry is used.

      Raises ValueError when the WHOIS data carries no domain name, rather
      than failing later with an unrelated AttributeError/IndexError.
      """
      domain_name = self.get_whois_data(url).domain_name

      if not domain_name:
          raise ValueError("Error: WHOIS data contains no domain name. URL: '" + url + "'")

      if isinstance(domain_name, list):
          domain_name = domain_name[0]

      return domain_name.lower()
  138. ######################################
  139. """
  140. Get initial and final URLs
  141. Compare whether the final (destination) URL
  142. matches with the initial URL in a request.
  143. Returns a dict object.
  144. """
  def get_startfinal_urls(self, url):
      """
      Report the initial URL, the final URL after redirects, and whether
      the final URL still belongs to the WHOIS domain of the initial URL.

      The comparison is done on the host part of the final URL only: it
      matches when the host equals the domain or is a subdomain of it.
      Matching the raw domain string as a regular expression against the
      whole URL would also accept look-alike hosts (the '.' wildcard) and
      domains that merely appear in the path or query string.

      Returns a dict of the form
      {'startfinal_urls': {'start_url': {'url': ...},
                           'final_url': {'url': ..., 'domain_match': bool}}}
      """
      from urllib.parse import urlsplit

      end_url = self.set_session(url).url
      domain_name = self.get_domain_name(url)

      # urlsplit() lower-cases the hostname; it is None for URLs without a host
      final_host = urlsplit(end_url).hostname or ""
      final_match = (
          final_host == domain_name
          or final_host.endswith("." + domain_name)
      )

      return {
          'startfinal_urls': {
              'start_url': {
                  'url': url
              },
              'final_url': {
                  'url': end_url, 'domain_match': final_match
              }
          }
      }
  167. ######################################
  168. """
  169. Get domain registrar
  170. Returns a dict object.
  171. """
  def get_domain_registrar(self, url):
      """
      Look up the registrar recorded in the WHOIS data of url.

      Returns a dict {'domain_registrar': <registrar>}.
      """
      registrar = self.get_whois_data(url).registrar
      return {'domain_registrar': registrar}
  175. ######################################
  176. """
  177. Do comparison between the domain name, extracted
  178. from WHOIS domain data and contents of a title HTML
  179. element, extracted from HTML data based on a given URL.
  180. Returns a dict object.
  181. """
  def get_domain_title_match(self, url):
      """
      Check whether the WHOIS domain name of url occurs in the page title.

      The search is case-insensitive and literal: the domain is escaped
      before it is used as a pattern, so '.' only matches a real dot.
      A missing title (None) never matches.

      Returns a dict
      {'webpage_title': <title>, 'domain_in_webpage_title': <bool>}.
      """
      domain_name = self.get_domain_name(url)
      title = self.get_webpage_title(url)

      # Accept either one domain name or a list of candidate names
      if isinstance(domain_name, str):
          candidates = [domain_name]
      elif isinstance(domain_name, list):
          candidates = domain_name
      else:
          candidates = []

      match = title is not None and any(
          re.search(re.escape(candidate), title, re.IGNORECASE)
          for candidate in candidates
      )

      return {
          'webpage_title': title,
          'domain_in_webpage_title': match
      }
  206. ######################################
  207. """
  208. Get a single timestamp from given data
  209. Two scenarios are considered: dates argument is either
  210. a list or a string. If it is a list, then we need
  211. to decide which date value to extract.
  212. Returns a date object.
  213. """
  def get_single_date(self, dates, newest=False):
      """
      Reduce one or several datetime values to a single datetime.

      dates  -- a datetime-like object or a list of them (each must
                provide timestamp())
      newest -- pick the latest value when true, otherwise the earliest

      Returns a datetime rebuilt from the chosen epoch timestamp.
      """
      candidates = dates if type(dates) is list else [dates]
      epochs = [moment.timestamp() for moment in candidates]
      ordered = sorted(epochs, reverse=newest)
      return datetime.fromtimestamp(ordered[0])
  222. ######################################
  223. """
  224. Get domain time information based on WHOIS domain data.
  225. Returns a dict object.
  226. """
  def get_domain_timeinfo(self, url):
      """
      Collect the creation, update and expiration dates from the WHOIS
      data of url, formatted with the module-level 'dateformat'.

      When a field holds several dates, the earliest one is used.
      A field that is missing or empty in the WHOIS data is reported as
      None instead of aborting the whole URL analysis.
      NOTE(review): assumes the WHOIS library reports absent fields as
      None/empty values - confirm against the library in use.

      Returns a dict
      {'domain_timestamps': {'created': ..., 'updated': ..., 'expires': ...}}.
      """
      whois_data = self.get_whois_data(url)

      def formatted(dates):
          if not dates:
              return None
          return self.get_single_date(dates, newest=False).strftime(dateformat)

      return {
          'domain_timestamps':
          {
              'created': formatted(whois_data.creation_date),
              'updated': formatted(whois_data.updated_date),
              'expires': formatted(whois_data.expiration_date)
          }
      }
  241. ######################################
  242. """
  243. Get domain time information based on WHOIS domain data,
  244. relative to the current date (UTC time).
  245. Returns a dict object.
  246. """
  def get_domain_timeinfo_relative(self, url):
      """
      Express the WHOIS creation, update and expiration dates of url as
      day counts relative to the current UTC time.

      When a field holds several dates, the earliest one is used.
      A field that is missing or empty in the WHOIS data yields None for
      its day count instead of aborting the whole URL analysis.
      NOTE(review): assumes the WHOIS library reports absent fields as
      None/empty values - confirm against the library in use.

      Returns a dict
      {'domain_timestamps_relative': {'current_date': ...,
                                      'created_days_ago': ...,
                                      'updated_days_ago': ...,
                                      'expires_days_left': ...}}.
      """
      from datetime import timezone

      # Naive UTC "now"; datetime.utcnow() is deprecated since Python 3.12
      date_now = datetime.now(timezone.utc).replace(tzinfo=None)
      whois_data = self.get_whois_data(url)

      def single(dates):
          if not dates:
              return None
          return self.get_single_date(dates, newest=False)

      created = single(whois_data.creation_date)
      updated = single(whois_data.updated_date)
      expires = single(whois_data.expiration_date)

      return {
          'domain_timestamps_relative':
          {
              'current_date': (date_now.strftime(dateformat)),
              'created_days_ago': None if created is None else (date_now - created).days,
              'updated_days_ago': None if updated is None else (date_now - updated).days,
              'expires_days_left': None if expires is None else (expires - date_now).days
          }
      }
  263. ######################################
  264. """
  265. Determine whether URL matches syntaxes such as
  266. '../foo/bar/'
  267. '/foo/../../bar/,
  268. 'https://foo.bar/foo/../'
  269. etc.
  270. Returns a boolean object.
  271. """
  def is_multidot_url(self, url):
      """
      Tell whether url contains a parent-directory step ('../').

      Returns True when the pattern is found, False otherwise.
      """
      return re.match(r".*[.]{2}/.*", url) is not None
  277. ######################################
  278. """
  279. Get HTML element data from HTML data contents.
  280. Two fetching methods are supported:
  281. - A) use only HTML element/tag name and extract raw contents of
  282. these tags
  283. - B) use both HTML element/tag name and more fine-grained
  284. inner attribute name to determine which HTML elements are extracted
  285. Special case - URL link references:
  286. - attributes 'href' or 'src' are considered as link referrals and
  287. they are handled in a special way
  288. - A) link referrals pointing directly to the domain itself are placed in 'self_refs' list
  289. (patterns: '/', '#', '../' and '/<anything>')
  290. - B) link referrals to external domains are placed in 'ext_refs' list
  291. (patterns such as 'https://foo.bar.dot/fancysite' etc.)
  292. - Both A) and B) link categories have 'normal' and 'multidot' subcategories
  293. - normal links do not contain pattern '../'
  294. - multidot links contain '../' pattern
  295. Returns a dict object.
  296. """
  def get_tag_data(self, url, tag, attribute=None):
      """
      Extract data of all <tag> elements found on the page behind url.

      - attribute is None: returns {tag: [prettified element, ...]}
      - attribute is given but is not 'href'/'src':
        returns {tag: {attribute: [attribute value, ...]}}
      - attribute is 'href' or 'src': the values are treated as links and
        split into references to the page's own domain and references to
        external domains, each with 'normal' and 'multidot' (containing
        '../') sub-lists:
        {tag: {attribute + '_ext':  {'normal': [...], 'multidot': [...]},
               attribute + '_self': {'normal': [...], 'multidot': [...]}}}
        External entries are dicts {'url': ..., 'registrar': ...}; the
        registrar is None when the WHOIS lookup fails. 'mailto:' links
        are skipped.

      Elements that do not carry the requested attribute are ignored.
      """
      html_data = self.get_html_data(url)
      domain_name = self.get_domain_name(url)

      if attribute is None:
          return {tag: [element.prettify() for element in html_data.find_all(tag)]}

      values = [
          element.get(attribute)
          for element in html_data.find_all(tag)
          if element.get(attribute) is not None
      ]

      if attribute not in ('href', 'src'):
          return {tag: {attribute: values}}

      self_refs = {'normal': [], 'multidot': []}
      ext_refs = {'normal': [], 'multidot': []}

      # Syntax: '#<anything>', '/<anything>', '../<anything>'
      rs = re.compile(r"^[/#]|^[.]{2}/.*")
      # Syntax: '<text>:<text>/'
      rd = re.compile(r"^[a-z]+:[a-z]+/")
      # Syntax examples:
      # 'http://foo.bar/', 'https://foo.bar/', 'foo.bar/', 'https://virus.foo.bar/'
      # The domain is escaped so that its dots are matched literally
      rl = re.compile(r"^([a-z]+://)?([^/]*" + re.escape(domain_name) + "/)")

      for link in values:
          # Ignore mailto links
          if link.startswith("mailto:"):
              continue

          category = 'multidot' if self.is_multidot_url(link) else 'normal'

          if rs.match(link) or rl.match(link) or rd.match(link):
              self_refs[category].append(link)
              continue

          try:
              registrar = self.get_whois_data(link).registrar
          except Exception:
              # Best effort: keep the link even if its WHOIS query fails
              registrar = None

          # The link stays in its own category whether or not WHOIS worked
          ext_refs[category].append({'url': link, 'registrar': registrar})

      return {
          tag: {
              attribute + '_ext': ext_refs,
              attribute + '_self': self_refs
          }
      }
  358. ######################################
  359. """
  360. How many external URL links have the same registrar as
  361. the webpage itself?
  362. """
  def get_registrar_count(self, registrar, urls):
      """
      Count how many entries of urls carry the given registrar.

      registrar -- registrar value to compare against
      urls      -- list of dicts; an entry counts as 'same' when it has a
                   'registrar' key whose value equals registrar

      Returns a dict {'same_registrar_count': n,
                      'other_registrar_count': len(urls) - n}.
      """
      same = sum(
          1
          for entry in urls
          if 'registrar' in entry and entry['registrar'] == registrar
      )
      return {
          'same_registrar_count': same,
          'other_registrar_count': len(urls) - same
      }
  375. ######################################
  376. """
  377. Get values existing in a dict object,
  378. based on a known key string.
  379. Returns a list object.
  380. TODO: Major re-work for the fetch function
  381. TODO: Support for more sophisticated JSON key string filtering
  382. (possibility to use multiple keys for filtering)
  383. """
  class json_fetcher(object):
      """
      Collect the values stored under one key name in nested JSON-like data.

      dict_data -- the data to search; it is normalised through a JSON
                   dump/load round trip
      json_key  -- the key whose values are collected
      """

      def __init__(self, dict_data, json_key):
          self.json_dict = json.loads(json.dumps(dict_data))
          self.json_key = json_key

      ##########
      def _fetch_from_dicts(self, items):
          """Yield values of json_key from the dicts directly inside items."""
          for item in items:
              if isinstance(item, dict):
                  for key, value in item.items():
                      if key == self.json_key:
                          yield value

      ##########
      # Ref: https://www.codespeedy.com/how-to-loop-through-json-with-subkeys-in-python/
      def fetch(self, jdata):
          """
          Yield every value stored under json_key in jdata.

          Nested dicts are searched recursively. Lists are searched one
          level deep: dicts directly inside a list are inspected, and so
          are dicts inside a list nested directly in a top-level list.
          """
          if isinstance(jdata, dict):
              for key, value in jdata.items():
                  if key == self.json_key:
                      yield value
                  elif isinstance(value, dict):
                      yield from self.fetch(value)
                  elif isinstance(value, list):
                      yield from self._fetch_from_dicts(value)
          elif isinstance(jdata, list):
              for item in jdata:
                  if isinstance(item, dict):
                      yield from self._fetch_from_dicts([item])
                  elif isinstance(item, list):
                      # Iterate the nested list itself; non-dict members
                      # are skipped
                      yield from self._fetch_from_dicts(item)

      ##########
      def get_data(self, flatten=True):
          """
          Return the fetched values as a list.

          With flatten enabled, nested lists among the fetched values are
          expanded into one flat list (the JSON data may contain the key
          in several nested sections). Otherwise the values are returned
          as fetched.
          """
          data_extract = list(self.fetch(self.json_dict))

          def flattened(values):
              for value in values:
                  if isinstance(value, list):
                      yield from flattened(value)
                  else:
                      yield value

          if flatten:
              return list(flattened(data_extract))
          return data_extract
  437. ######################################
  438. """
  439. Compile URL related data.
  440. """
  def get_url_data(self, url):
      """
      Gather all collected information about url into one dict.

      Returns {url: [simple_data, startfinal_urls, {'redirects': ...},
                     relative_timestamps, timestamps,
                     {'webpage_data': [iframe data, link element data...]}]}
      where simple_data holds the registrar, title match, iframe count,
      multidot URL count and same/other registrar counts.

      Each link element listed in 'link_refs' is fetched exactly once and
      the result is reused for the multidot and registrar counts; fetching
      it again would repeat the page download and every WHOIS query.
      """
      startfinal_url = self.get_startfinal_urls(url)
      redirect_url = self.get_url_redirects(url)
      domain_registrar = self.get_domain_registrar(url)
      domaintitle_match = self.get_domain_title_match(url)
      domain_time_relative = self.get_domain_timeinfo_relative(url)
      domain_time = self.get_domain_timeinfo(url)

      html_element_iframe = self.get_tag_data(url, 'iframe')

      # tag name -> extracted link data, in 'link_refs' order
      link_elements = {
          tag: self.get_tag_data(url, tag, attribute)
          for tag, attribute in link_refs.items()
      }

      iframes_count = {
          'iframes_count':
              len(self.json_fetcher(html_element_iframe, 'iframe').get_data())
      }

      multidot_urls_count = {
          'multidot_url_count': sum(
              len(self.json_fetcher(element, 'multidot').get_data())
              for element in link_elements.values()
          )
      }

      # How many 'normal' external links share the page's registrar
      same_registrar_counts = 0
      other_registrar_counts = 0
      for tag, attribute in link_refs.items():
          counts = self.get_registrar_count(
              domain_registrar['domain_registrar'],
              link_elements[tag][tag][attribute + '_ext']['normal']
          )
          same_registrar_counts += counts['same_registrar_count']
          other_registrar_counts += counts['other_registrar_count']

      # Avoid unnecessary nesting of the following data
      data_simple = {}
      data_simple.update(domain_registrar)
      data_simple.update(domaintitle_match)
      data_simple.update(iframes_count)
      data_simple.update(multidot_urls_count)
      data_simple.update({
          'same_registrar_count': same_registrar_counts,
          'other_registrar_count': other_registrar_counts
      })

      return {
          url: [
              data_simple,
              startfinal_url,
              {'redirects': redirect_url},
              domain_time_relative,
              domain_time,
              {'webpage_data': [html_element_iframe] + list(link_elements.values())}
          ]
      }
  class write_operations(object):
      """
      Fetch URL data and store it as JSON.

      The target file path starts out as the module-level 'filename'.
      """

      def __init__(self):
          self.filename = filename

      ######################################
      def set_filename(self):
          """
          Pick a file name that does not exist yet.

          If the current name is taken, a numeric suffix ('.0', '.1', ...)
          is appended and increased until a free name is found. The chosen
          name is stored in self.filename and returned.
          """
          base = self.filename
          counter = 0
          while os.path.exists(self.filename):
              self.filename = base + "." + str(counter)
              counter += 1
          return self.filename

      ######################################
      def write_to_file(self, data):
          """
          Append the string data to the file at self.filename.

          Returns 0 on success and 1 if the file could not be written.
          """
          try:
              # The context manager closes the file even if write() fails
              with open(self.filename, "a", encoding="utf-8") as json_file:
                  json_file.write(data)
          except OSError:
              return 1
          return 0

      ######################################
      def fetch_and_store_url_data(self, urls, use_file):
          """
          Collect data for every URL in urls and serialise it as JSON.

          URLs whose data cannot be collected are reported on stdout and
          skipped. When use_file is true, the JSON string is also
          appended to the file at self.filename.

          Returns the JSON string.
          """
          data_parts = {}
          fetch_json_data = json_url_data()

          for u in urls:
              print("URL data: %s" % u)
              try:
                  data_parts.update(fetch_json_data.get_url_data(u))
              except Exception as err:
                  # Best effort: one failing URL must not stop the others
                  print("Failed: %s (%s)" % (u, err))

          json_data = json.dumps(data_parts)

          if use_file:
              if self.write_to_file(json_data) != 0:
                  print("Failed to write file: %s" % self.filename)

          return json_data
  557. ######################################
  558. """
  559. Visualize & summarize data.
  560. """
  class data_visualization(object):
      """
      Summaries over the collected JSON data of a single URL.

      url       -- key of the URL inside json_data
      json_data -- collected data, keyed by URL
      """

      def __init__(self, url, json_data):
          self.url = url
          self.json_data = json_data

          # Normalise through a JSON round trip, then pick this URL's entry
          normalised = json.loads(json.dumps(self.json_data))
          self.data = normalised.get(self.url)

          self.json_url_obj = json_url_data()

          registrar_info = self.json_url_obj.get_domain_registrar(self.url)
          self.domain_registrar = registrar_info['domain_registrar']

          fetcher = self.json_url_obj.json_fetcher(self.data, 'webpage_data')
          self.webpage_data = fetcher.get_data()

      def get_urls_count_summary(self):
          """
          Count local and external link references in the webpage data.

          Returns {'local_urls': n, 'external_urls': m}.
          """
          # Distinct link attributes, in first-seen order
          unique_refs = list(dict.fromkeys(link_refs.values()))

          def link_count(refs, suffix):
              total = 0
              for page_part in self.webpage_data:
                  for ref in refs:
                      groups = self.json_url_obj.json_fetcher(page_part, ref + suffix).get_data()
                      for group in groups:
                          total += len(group['normal'])
                          total += len(group['multidot'])
              return total

          return {
              'local_urls': link_count(unique_refs, '_self'),
              'external_urls': link_count(unique_refs, '_ext')
          }

      def get_registrars(self):
          """
          Return every non-None 'registrar' value found in the webpage data.
          """
          registrars = []
          for page_part in self.webpage_data:
              found = self.json_url_obj.json_fetcher(page_part, 'registrar').get_data()
              registrars.extend(name for name in found if name is not None)
          return registrars

      def get_registrar_count_summary(self):
          """
          Return the occurrence count of each fetched registrar, together
          with the registrar of the URL's own domain.
          """
          occurrences = dict(Counter(self.get_registrars()))
          return {
              'fetched_domains': occurrences,
              'url_domain_registrar': self.domain_registrar
          }
  601. ######################################
  602. """
  603. Execute the main program code.
  604. TODO: this code must figure out the correct JSON file
  605. if multiple generated files are present.
  606. """
  if __name__ == '__main__':

      json_data = None
      json_path = filename

      if not plot_only:
          write_obj = write_operations()
          # set_filename() may choose a suffixed name; remember the real one
          json_path = write_obj.set_filename()
          # fetch_and_store_url_data() returns a JSON string, so parse it
          # before use. This also covers the case where no file is written.
          json_data = json.loads(write_obj.fetch_and_store_url_data(urls, use_file))

      if json_data is None:
          # Plot-only mode: the data must come from an existing JSON file
          if not os.path.exists(json_path):
              raise SystemExit("Error: JSON data file not found: " + json_path)
          with open(json_path, "r") as json_file:
              json_data = json.load(json_file)

      url_str_pattern = re.compile(r"(^[a-z]+://)?([^/]*)")

      # Must be set before the first figure is created to take effect on it
      plt.rcParams.update({
          'xtick.labelsize': 8,
          'figure.figsize': [9, 8]
          # 'figure.constrained_layout.use': True
      })

      # Get URLs from the available JSON data
      for key_url in json_data:

          print("Generate statistics: %s" % key_url)

          plt.figure()

          # Host part of the URL without dots, used in the image file name
          domain_string = url_str_pattern.match(key_url).group(2).replace('.', '')

          summary = data_visualization(key_url, json_data)
          summary_registrars = summary.get_registrar_count_summary()['fetched_domains']

          x_r = list(summary_registrars.keys())
          y_r = list(summary_registrars.values())

          plt.bar(x_r, y_r, color="green", edgecolor="black")

          # Show bar values
          for index, count in enumerate(y_r):
              plt.text(x=index, y=count + 0.5, s=str(count), fontdict=dict(fontsize=8))

          plt.title("Domains associated with HTML URL data (" + key_url + ")")
          plt.xlabel("Fetched domains")
          plt.ylabel("Domain count")
          plt.xticks(rotation=45, horizontalalignment="right")

          if save_plot_images:
              plt.savefig(os.getcwd() + "/" + "domain_figure_" + domain_string + ".png", dpi=plot_images_dpi)
          plt.show()
          # Release the figure so figures do not pile up across URLs
          plt.close()