|
|
@ -96,21 +96,21 @@ class json_url_data(object): |
|
|
|
Returns a requests.models.Response object. |
|
|
|
""" |
|
|
|
def set_session(self, url, method='get', redirects=True):
    """Open an HTTP session for the given URL.

    url       -- target URL
    method    -- HTTP method name accepted by requests (default: 'get')
    redirects -- whether HTTP redirects are followed (default: True)

    Returns a requests.models.Response object.

    Raises Exception when the request itself fails or when the server
    answers with a status code outside the 1XX/2XX/3XX range.
    """
    # HTTP response status codes 1XX, 2XX and 3XX are OK;
    # treat other codes as errors.
    sc = re.compile(r"^[123][0-9]{2}")

    # Throttle consecutive requests.
    # NOTE(review): sleep_interval_between_requests and request_headers
    # are module-level settings defined elsewhere in this file.
    sleep(sleep_interval_between_requests)

    try:
        session = requests.Session()
        response = session.request(method, url, headers=request_headers, allow_redirects=redirects)
    except Exception:
        raise Exception("Error: HTTP session could not be established. URL: '" + url + "' (method: " + method + ")") from None

    # The status check sits outside the try block so that an invalid
    # status code is reported as such; the original bare 'except:'
    # caught this Exception too and re-raised it with the misleading
    # "session could not be established" message.
    if not sc.match(str(response.status_code)):
        raise Exception("Error: got invalid response status from the web server")

    return response
|
|
|
|
|
|
@ -121,7 +121,7 @@ class json_url_data(object): |
|
|
|
Returns a bs4.BeautifulSoup object. |
|
|
|
""" |
|
|
|
def get_html_data(self, url): |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
data = bs(self.set_session(url).content, 'html.parser') |
|
|
|
return data |
|
|
@ -135,15 +135,15 @@ class json_url_data(object): |
|
|
|
Returns a list object. |
|
|
|
""" |
|
|
|
def get_url_redirects(self, url):
    """Collect the redirect hops seen while fetching the URL.

    url -- target URL

    Returns a list object: one {'redirect_url', 'status'} dict per
    intermediate response, empty when no redirects occurred.
    """
    history = self.set_session(url).history
    return [
        {'redirect_url': hop.url, 'status': hop.status_code}
        for hop in history
    ]
|
|
|
|
|
|
|
###################################### |
|
|
@ -153,9 +153,9 @@ class json_url_data(object): |
|
|
|
Returns a string object. |
|
|
|
""" |
|
|
|
def get_webpage_title(self, url):
    """Extract the contents of the <title> HTML element.

    url -- target URL

    Returns a string object (bs4 NavigableString; None when the page
    has a <title> tag with no string content).
    """
    return self.get_html_data(url).title.string
|
|
|
|
|
|
@ -175,7 +175,7 @@ class json_url_data(object): |
|
|
|
""" |
|
|
|
def get_domain_name(self, url): |
|
|
|
domain_name = self.get_whois_data(url).domain_name |
|
|
|
|
|
|
|
|
|
|
|
if type(domain_name) is list: |
|
|
|
return domain_name[0].lower() |
|
|
|
else: |
|
|
@ -184,29 +184,29 @@ class json_url_data(object): |
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get initial and final URLs |
|
|
|
|
|
|
|
|
|
|
|
Compare whether the final (destination) URL |
|
|
|
matches with the initial URL in a request. |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_startfinal_urls(self, url): |
|
|
|
|
|
|
|
|
|
|
|
response = self.set_session(url) |
|
|
|
end_url = response.url |
|
|
|
|
|
|
|
|
|
|
|
start_match = False |
|
|
|
final_match = False |
|
|
|
|
|
|
|
|
|
|
|
# dr = re.compile(r"^([a-z]+://)?([^/]+)") |
|
|
|
# dr_group_lastindex = dr.match(url).lastindex |
|
|
|
# domain_name = dr.match(url).group(dr_group_lastindex) |
|
|
|
|
|
|
|
|
|
|
|
domain_name = self.get_domain_name(url) |
|
|
|
|
|
|
|
|
|
|
|
if re.search(domain_name, end_url): |
|
|
|
final_match = True |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
'startfinal_urls': { |
|
|
|
'start_url': { |
|
|
@ -217,13 +217,13 @@ class json_url_data(object): |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get domain registrar |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_domain_registrar(self, url): |
|
|
@ -235,21 +235,21 @@ class json_url_data(object): |
|
|
|
Do comparison between the domain name, extracted |
|
|
|
from WHOIS domain data and contents of a title HTML |
|
|
|
element, extracted from HTML data based on a given URL. |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_domain_title_match(self, url): |
|
|
|
|
|
|
|
|
|
|
|
domain_name = self.get_domain_name(url) |
|
|
|
title = self.get_webpage_title(url) |
|
|
|
|
|
|
|
|
|
|
|
# If is string: |
|
|
|
if type(domain_name) is str: |
|
|
|
if re.search(domain_name, title, re.IGNORECASE): |
|
|
|
match = True |
|
|
|
else: |
|
|
|
match = False |
|
|
|
|
|
|
|
|
|
|
|
# If is list: |
|
|
|
elif type(domain_name) is list: |
|
|
|
for d in domain_name: |
|
|
@ -260,49 +260,49 @@ class json_url_data(object): |
|
|
|
match = False |
|
|
|
else: |
|
|
|
match = False |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
'webpage_title': title, |
|
|
|
'domain_in_webpage_title': match |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get a single timestamp from given data |
|
|
|
|
|
|
|
|
|
|
|
Two scenarios are considered: dates argument is either |
|
|
|
a list or a string. If it is a list, then we need |
|
|
|
to decide which date value to extract. |
|
|
|
|
|
|
|
|
|
|
|
Returns a date object. |
|
|
|
""" |
|
|
|
def get_single_date(self, dates, newest=False):
    """Reduce one-or-many datetime values to a single datetime.

    dates  -- a datetime object, or a list of datetime objects
    newest -- pick the most recent value when True, the oldest otherwise

    Returns a date(time) object, reconstructed from the chosen epoch
    timestamp.
    """
    # Normalise input to a list of epoch timestamps, then take the
    # first element of the (optionally reversed) sorted order.
    if type(dates) is list:
        stamps = [d.timestamp() for d in dates]
    else:
        stamps = [dates.timestamp()]

    stamps.sort(reverse=newest)
    return datetime.fromtimestamp(stamps[0])
|
|
|
|
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get domain time information based on WHOIS domain data. |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_domain_timeinfo(self, url): |
|
|
|
|
|
|
|
|
|
|
|
whois_data = self.get_whois_data(url) |
|
|
|
domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False) |
|
|
|
domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False) |
|
|
|
domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False) |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
'domain_timestamps': |
|
|
|
{ |
|
|
@ -311,25 +311,25 @@ class json_url_data(object): |
|
|
|
'expires': domain_expiration_date.strftime(dateformat) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get domain time information based on WHOIS domain data, |
|
|
|
relative to the current date (UTC time). |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
def get_domain_timeinfo_relative(self, url): |
|
|
|
|
|
|
|
|
|
|
|
date_now = datetime.utcnow() |
|
|
|
|
|
|
|
|
|
|
|
whois_data = self.get_whois_data(url) |
|
|
|
domain_creation_date = self.get_single_date(whois_data.creation_date, newest = False) |
|
|
|
domain_updated_date = self.get_single_date(whois_data.updated_date, newest = False) |
|
|
|
domain_expiration_date = self.get_single_date(whois_data.expiration_date, newest = False) |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
'domain_timestamps_relative': |
|
|
|
{ |
|
|
@ -339,7 +339,7 @@ class json_url_data(object): |
|
|
|
'expires_days_left': (domain_expiration_date - date_now).days |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
@ -348,15 +348,15 @@ class json_url_data(object): |
|
|
|
'../foo/bar/' |
|
|
|
'/foo/../../bar/, |
|
|
|
'https://foo.bar/foo/../' |
|
|
|
|
|
|
|
|
|
|
|
etc. |
|
|
|
|
|
|
|
|
|
|
|
Returns a boolean object. |
|
|
|
""" |
|
|
|
def is_multidot_url(self, url):
    """Tell whether the URL contains a '../' traversal sequence.

    url -- URL string to inspect

    Returns a boolean object: True for URLs such as '../foo/bar/',
    '/foo/../../bar/' or 'https://foo.bar/foo/../', False otherwise.
    """
    # Any two consecutive dots immediately followed by '/' counts,
    # wherever it appears in the URL.
    return re.match(r".*[.]{2}/.*", url) is not None
|
|
@ -364,13 +364,13 @@ class json_url_data(object): |
|
|
|
###################################### |
|
|
|
""" |
|
|
|
Get HTML element data from HTML data contents. |
|
|
|
|
|
|
|
|
|
|
|
Two fetching methods are supported: |
|
|
|
- A) use only HTML element/tag name and extract raw contents of |
|
|
|
these tags |
|
|
|
- B) use both HTML element/tag name and more fine-grained |
|
|
|
inner attribute name to determine which HTML elements are extracted |
|
|
|
|
|
|
|
|
|
|
|
Special case - URL link references: |
|
|
|
- attributes 'href' or 'src' are considered as link referrals and |
|
|
|
they are handled in a special way |
|
|
@ -378,55 +378,55 @@ class json_url_data(object): |
|
|
|
(patterns: '/', '#', '../' and '/<anything>') |
|
|
|
- B) link referrals to external domains are placed in 'ext_refs' list |
|
|
|
(patterns such as 'https://foo.bar.dot/fancysite' etc.) |
|
|
|
|
|
|
|
|
|
|
|
- Both A) and B) link categories have 'normal' and 'multidot' subcategories |
|
|
|
- normal links do not contain pattern '../' |
|
|
|
- multidot links contain '../' pattern |
|
|
|
|
|
|
|
|
|
|
|
Returns a dict object. |
|
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
def get_tag_data(self, url, tag, attribute=None): |
|
|
|
|
|
|
|
|
|
|
|
html_data = self.get_html_data(url) |
|
|
|
domain_name = self.get_domain_name(url) |
|
|
|
data = [] |
|
|
|
|
|
|
|
|
|
|
|
if attribute != None: |
|
|
|
|
|
|
|
|
|
|
|
for d in html_data.find_all(tag): |
|
|
|
|
|
|
|
|
|
|
|
# Ignore the HTML tag if it does not contain our attribute |
|
|
|
if d.get(attribute) != None: |
|
|
|
data.append(d.get(attribute)) |
|
|
|
|
|
|
|
|
|
|
|
if attribute == 'href' or attribute == 'src': |
|
|
|
|
|
|
|
|
|
|
|
self_refs = { 'normal': [], 'multidot': []} |
|
|
|
ext_refs = { 'normal': [], 'multidot': []} |
|
|
|
|
|
|
|
|
|
|
|
# Syntax: '#<anything>', '/<anything>', '../<anything>' |
|
|
|
rs = re.compile(r"^[/#]|^[.]{2}/.*") |
|
|
|
|
|
|
|
|
|
|
|
# Syntax: '<text>:<text>/' |
|
|
|
rd = re.compile(r"^[a-z]+:[a-z]+/") |
|
|
|
|
|
|
|
|
|
|
|
# Syntax examples: |
|
|
|
# 'http://foo.bar/', 'https://foo.bar/, 'foo.bar/', 'https://virus.foo.bar/' |
|
|
|
rl = re.compile(r"^([a-z]+://)?([^/]*" + domain_name + "/)") |
|
|
|
|
|
|
|
|
|
|
|
for s in data: |
|
|
|
|
|
|
|
|
|
|
|
# Ignore mailto links |
|
|
|
if re.match("^mailto:", s): continue |
|
|
|
|
|
|
|
|
|
|
|
if rs.match(s) or rl.match(s) or rd.match(s): |
|
|
|
if self.is_multidot_url(s): |
|
|
|
self_refs['multidot'].append(s) |
|
|
|
else: |
|
|
|
self_refs['normal'].append(s) |
|
|
|
else: |
|
|
|
|
|
|
|
|
|
|
|
if self.is_multidot_url(s): |
|
|
|
try: |
|
|
|
ext_refs['multidot'].append({'url': s, 'registrar': self.get_whois_data(s).registrar }) |
|
|
@ -440,31 +440,31 @@ class json_url_data(object): |
|
|
|
except: |
|
|
|
ext_refs['normal'].append({'url': s, 'registrar': None }) |
|
|
|
pass |
|
|
|
|
|
|
|
|
|
|
|
data = None |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
tag: { |
|
|
|
attribute + '_ext': (ext_refs), |
|
|
|
attribute + '_self': (self_refs) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
else: |
|
|
|
dict_data = { |
|
|
|
tag: { |
|
|
|
attribute: (data) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
else: |
|
|
|
for d in html_data.find_all(tag): |
|
|
|
data.append(d.prettify()) |
|
|
|
|
|
|
|
|
|
|
|
dict_data = { |
|
|
|
tag: (data) |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return dict_data |
|
|
|
|
|
|
|
###################################### |
|
|
@ -473,21 +473,21 @@ class json_url_data(object): |
|
|
|
the webpage itself? |
|
|
|
""" |
|
|
|
def get_registrar_count(self, registrar, urls):
    """Count how many URL entries share the given registrar.

    registrar -- registrar name to compare against
    urls      -- list of dicts, each optionally carrying a 'registrar' key

    Returns a dict object with 'same_registrar_count' and
    'other_registrar_count' keys; entries without a 'registrar' key
    count as "other".
    """
    # Direct key lookup instead of scanning every (key, value) pair of
    # each dict. The explicit membership test keeps a missing
    # 'registrar' key from ever matching (even when registrar is None).
    same = sum(
        1 for u in urls
        if 'registrar' in u and u['registrar'] == registrar
    )

    return {
        'same_registrar_count': same,
        'other_registrar_count': len(urls) - same,
    }
|
|
|
|
|
|
|
###################################### |
|
|
@ -495,9 +495,9 @@ class json_url_data(object): |
|
|
|
""" |
|
|
|
Get values existing in a dict object, |
|
|
|
based on a known key string. |
|
|
|
|
|
|
|
|
|
|
|
Returns a list object. |
|
|
|
|
|
|
|
|
|
|
|
TODO: Major re-work for the fetch function |
|
|
|
|
|
|
|
TODO: Support for more sophisticated JSON key string filtering |
|
|
@ -563,7 +563,7 @@ class json_url_data(object): |
|
|
|
if flatten == True: |
|
|
|
for u in get_data_extract(data_extract): |
|
|
|
flat_data.append(u) |
|
|
|
|
|
|
|
|
|
|
|
return flat_data |
|
|
|
else: |
|
|
|
return data_extract |
|
|
@ -573,21 +573,21 @@ class json_url_data(object): |
|
|
|
Compile URL related data. |
|
|
|
""" |
|
|
|
def get_url_data(self, url): |
|
|
|
|
|
|
|
|
|
|
|
# Dict object for simple, non-nested data |
|
|
|
data_simple = {} |
|
|
|
|
|
|
|
# Pre-defined dict object for specific data sets |
|
|
|
webpage_data = {} |
|
|
|
|
|
|
|
|
|
|
|
startfinal_url = self.get_startfinal_urls(url) |
|
|
|
redirect_url = self.get_url_redirects(url) |
|
|
|
domain_registrar = self.get_domain_registrar(url) |
|
|
|
domaintitle_match = self.get_domain_title_match(url) |
|
|
|
|
|
|
|
|
|
|
|
domain_time_relative = self.get_domain_timeinfo_relative(url) |
|
|
|
domain_time = self.get_domain_timeinfo(url) |
|
|
|
|
|
|
|
|
|
|
|
html_element_iframe = self.get_tag_data(url, 'iframe') |
|
|
|
html_element_a_href = self.get_tag_data(url, 'a', link_refs['a']) |
|
|
|
html_element_img_src = self.get_tag_data(url, 'img', link_refs['img']) |
|
|
@ -597,53 +597,53 @@ class json_url_data(object): |
|
|
|
'iframes_count': |
|
|
|
len(self.json_fetcher(html_element_iframe, 'iframe').get_data()) |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
multidot_urls_count = { |
|
|
|
'multidot_url_count': |
|
|
|
len(self.json_fetcher(html_element_a_href, 'multidot').get_data()) + len(self.json_fetcher(html_element_img_src, 'multidot').get_data()) + len(self.json_fetcher(html_element_script_src, 'multidot').get_data()) |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
################### |
|
|
|
def get_total_registrars(): |
|
|
|
|
|
|
|
same_registrar_counts = 0 |
|
|
|
other_registrar_counts = 0 |
|
|
|
for k,v in link_refs.items(): |
|
|
|
|
|
|
|
|
|
|
|
html_element = self.get_tag_data(url, k, v) |
|
|
|
|
|
|
|
|
|
|
|
same_registrar_counts += self.get_registrar_count( |
|
|
|
domain_registrar['domain_registrar'], |
|
|
|
html_element[k][v + '_ext']['normal'] |
|
|
|
)['same_registrar_count'] |
|
|
|
|
|
|
|
|
|
|
|
other_registrar_counts += self.get_registrar_count( |
|
|
|
domain_registrar['domain_registrar'], |
|
|
|
html_element[k][v + '_ext']['normal'] |
|
|
|
)['other_registrar_count'] |
|
|
|
|
|
|
|
|
|
|
|
registrar_counts = { |
|
|
|
'same_registrar_count': same_registrar_counts, |
|
|
|
'other_registrar_count': other_registrar_counts |
|
|
|
} |
|
|
|
return registrar_counts |
|
|
|
|
|
|
|
|
|
|
|
# Avoid unnecessary nesting of the following data |
|
|
|
data_simple.update(domain_registrar) |
|
|
|
data_simple.update(domaintitle_match) |
|
|
|
data_simple.update(iframes_count) |
|
|
|
data_simple.update(multidot_urls_count) |
|
|
|
data_simple.update(get_total_registrars()) |
|
|
|
|
|
|
|
|
|
|
|
url_data = dict({ |
|
|
|
url: [ |
|
|
|
data_simple, |
|
|
|
startfinal_url, |
|
|
|
{'redirects': redirect_url}, |
|
|
|
|
|
|
|
|
|
|
|
domain_time_relative, |
|
|
|
domain_time, |
|
|
|
|
|
|
|
|
|
|
|
{'webpage_data': [ |
|
|
|
html_element_iframe, |
|
|
|
html_element_a_href, |
|
|
@ -653,7 +653,7 @@ class json_url_data(object): |
|
|
|
} |
|
|
|
] |
|
|
|
}) |
|
|
|
|
|
|
|
|
|
|
|
return url_data |
|
|
|
|
|
|
|
|
|
|
@ -667,11 +667,11 @@ class write_operations(object): |
|
|
|
""" |
|
|
|
Set JSON file name, append number suffix |
|
|
|
# if file exists already. |
|
|
|
|
|
|
|
|
|
|
|
Returns file name path. |
|
|
|
""" |
|
|
|
def set_filename(self): |
|
|
|
|
|
|
|
|
|
|
|
c = 0 |
|
|
|
while True: |
|
|
|
if os.path.exists(self.filename): |
|
|
@ -689,7 +689,7 @@ class write_operations(object): |
|
|
|
Append to a JSON file. |
|
|
|
""" |
|
|
|
def write_to_file(self, data): |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
json_file = open(self.filename, "a") |
|
|
|
json_file.write(data) |
|
|
@ -762,7 +762,7 @@ class data_visualization(object): |
|
|
|
'local_urls': link_count(unique_refs, '_self'), |
|
|
|
'external_urls': link_count(unique_refs, '_ext') |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
return data |
|
|
|
|
|
|
|
def get_registrars(self): |
|
|
@ -778,7 +778,7 @@ class data_visualization(object): |
|
|
|
return registrars |
|
|
|
|
|
|
|
def get_registrar_count_summary(self):
    """Summarise registrar frequencies for the fetched domains.

    Returns a dict object: per-registrar counts under
    'fetched_domains', plus the registrar of the URL's own domain
    under 'url_domain_registrar'.
    """
    counted = Counter(self.get_registrars())
    return {
        'fetched_domains': dict(counted),
        'url_domain_registrar': self.domain_registrar,
    }
|
|
@ -807,7 +807,7 @@ if __name__ == '__main__': |
|
|
|
|
|
|
|
# Get URLs from an available JSON data |
|
|
|
for key_url in json_data.keys(): |
|
|
|
|
|
|
|
|
|
|
|
print("Generating statistics: %s" % key_url) |
|
|
|
|
|
|
|
fig = plt.figure() |
|
|
@ -817,19 +817,19 @@ if __name__ == '__main__': |
|
|
|
# 'figure.constrained_layout.use': True |
|
|
|
} |
|
|
|
plt.rcParams.update(fig_params) |
|
|
|
|
|
|
|
|
|
|
|
domain_string = url_str_pattern.split(key_url)[2].replace('.','') |
|
|
|
summary = data_visualization(key_url, json_data) |
|
|
|
|
|
|
|
|
|
|
|
summary_registrars = summary.get_registrar_count_summary()['fetched_domains'] |
|
|
|
|
|
|
|
x_r = list(summary_registrars.keys()) |
|
|
|
y_r = list(summary_registrars.values()) |
|
|
|
|
|
|
|
|
|
|
|
# Show bar values |
|
|
|
for index,data in enumerate(y_r): |
|
|
|
plt.text(x=index, y=data+0.5, s=data, fontdict=dict(fontsize=8)) |
|
|
|
|
|
|
|
|
|
|
|
title_r = "Domains associated with HTML URL data (" + key_url + ")" |
|
|
|
xlabel_r = "Fetched domains" |
|
|
|
ylabel_r = "Domain count" |
|
|
@ -845,18 +845,17 @@ if __name__ == '__main__': |
|
|
|
plt.show() |
|
|
|
|
|
|
|
#fig_u = plt.figure() |
|
|
|
|
|
|
|
|
|
|
|
#summary_urls = summary.get_urls_count_summary() |
|
|
|
|
|
|
|
|
|
|
|
#x_u = list(summary_urls.keys()) |
|
|
|
#y_u = list(summary_urls.values()) |
|
|
|
#title_u = "Local and external URL references (" + key_url + ")" |
|
|
|
#xlabel_u = "Fetched URLs" |
|
|
|
#ylabel_u = "URL count" |
|
|
|
|
|
|
|
|
|
|
|
#plt.bar(x_u, y_u, color="blue", edgecolor='black') |
|
|
|
#plt.title(title_u) |
|
|
|
#plt.xlabel(xlabel_u) |
|
|
|
#plt.ylabel(ylabel_u) |
|
|
|
#plt.show() |
|
|
|
|